From 41c51f0c68b21b4603bd2a9c3d3ad017fdd22627 Mon Sep 17 00:00:00 2001
From: Nathan Bossart
Date: Sat, 6 Apr 2024 22:58:23 -0500
Subject: [PATCH] Optimize visibilitymap_count() with AVX-512 instructions.

Commit 792752af4e added infrastructure for using AVX-512 intrinsic
functions, and this commit uses that infrastructure to optimize
visibilitymap_count().  Specifically, a new pg_popcount_masked()
function is introduced that applies a bitmask to every byte in the
buffer prior to calculating the population count, which is used to
filter out the all-visible or all-frozen bits as needed.  Platforms
without AVX-512 support should also see a nice speedup due to the
reduced number of calls to a function pointer.

Co-authored-by: Ants Aasma
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
---
 src/backend/access/heap/visibilitymap.c |  25 +----
 src/include/port/pg_bitutils.h          |  34 +++++++
 src/port/pg_bitutils.c                  | 126 ++++++++++++++++++++++
 src/port/pg_popcount_avx512.c           |  60 +++++++++++
 4 files changed, 225 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1ab6c865e3..8b24e7bc33 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -119,10 +119,8 @@
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each
-                                                        * bit pair */
-#define FROZEN_MASK64  UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
-                                                        * bit pair */
+#define VISIBLE_MASK8 (0x55)    /* The lower bit of each bit pair */
+#define FROZEN_MASK8  (0xaa)    /* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 	{
 		Buffer		mapBuffer;
 		uint64	   *map;
-		int			i;
 
 		/*
 		 * Read till we fall off the end of the map.  We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		 */
 		map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-		StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-						 "unsupported MAPSIZE");
-		if (all_frozen == NULL)
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-		}
-		else
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-			}
-		}
+		nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+		if (all_frozen)
+			nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
 		ReleaseBuffer(mapBuffer);
 	}
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index b453f84d8f..4d88478c9c 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,6 +303,7 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX-512 popcount instruction on some systems.
@@ -313,6 +314,7 @@ extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
@@ -320,6 +322,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
 
 #endif							/* TRY_POPCNT_FAST */
 
@@ -357,6 +360,37 @@ pg_popcount(const char *buf, int bytes)
 	return pg_popcount_optimized(buf, bytes);
 }
 
+/*
+ * Returns the number of 1-bits in buf after applying the mask to each byte.
+ *
+ * Similar to pg_popcount(), we only take on the function pointer overhead when
+ * it's likely to be faster.
+ */
+static inline uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+	/*
+	 * We set the threshold to the point at which we'll first use special
+	 * instructions in the optimized version.
+	 */
+#if SIZEOF_VOID_P >= 8
+	int			threshold = 8;
+#else
+	int			threshold = 4;
+#endif
+
+	if (bytes < threshold)
+	{
+		uint64		popcnt = 0;
+
+		while (bytes--)
+			popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+		return popcnt;
+	}
+
+	return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 411be90f73..87f56e82b8 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -106,19 +106,23 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 static uint64 pg_popcount_fast(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -156,17 +160,22 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount_optimized = pg_popcount_fast;
+		pg_popcount_masked_optimized = pg_popcount_masked_fast;
 	}
 	else
 	{
 		pg_popcount32 = pg_popcount32_slow;
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount_optimized = pg_popcount_slow;
+		pg_popcount_masked_optimized = pg_popcount_masked_slow;
 	}
 
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 	if (pg_popcount_avx512_available())
+	{
 		pg_popcount_optimized = pg_popcount_avx512;
+		pg_popcount_masked_optimized = pg_popcount_masked_avx512;
+	}
 #endif
 }
 
@@ -191,6 +200,13 @@ pg_popcount_choose(const char *buf, int bytes)
 	return pg_popcount_optimized(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
@@ -271,6 +287,56 @@ pg_popcount_fast(const char *buf, int bytes)
 	return popcnt;
 }
 
+/*
+ * pg_popcount_masked_fast
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~((uint32) 0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+	return popcnt;
+}
+
 #endif							/* TRY_POPCNT_FAST */
 
 
@@ -370,6 +436,56 @@ pg_popcount_slow(const char *buf, int bytes)
 	return popcnt;
 }
 
+/*
+ * pg_popcount_masked_slow
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_slow(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~((uint32) 0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_slow(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+	return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -401,4 +517,14 @@ pg_popcount_optimized(const char *buf, int bytes)
 	return pg_popcount_slow(buf, bytes);
 }
 
+/*
+ * pg_popcount_masked_optimized
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+	return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index 908817617a..a67092b9c4 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -78,4 +78,64 @@ pg_popcount_avx512(const char *buf, int bytes)
 	return _mm512_reduce_add_epi64(accum);
 }
 
+/*
+ * pg_popcount_masked_avx512
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+	__m512i		val,
+				vmasked,
+				cnt;
+	__m512i		accum = _mm512_setzero_si512();
+	const char *final;
+	int			tail_idx;
+	__mmask64	bmask = ~UINT64CONST(0);
+	const __m512i maskv = _mm512_set1_epi8(mask);
+
+	/*
+	 * Align buffer down to avoid double load overhead from unaligned access.
+	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
+	 * iteration and ensure it is not empty.
+	 */
+	bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
+	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
+	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
+	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
+
+	/*
+	 * Iterate through all but the final iteration.  Starting from the second
+	 * iteration, the mask is ignored.
+	 */
+	if (buf < final)
+	{
+		val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+		vmasked = _mm512_and_si512(val, maskv);
+		cnt = _mm512_popcnt_epi64(vmasked);
+		accum = _mm512_add_epi64(accum, cnt);
+
+		buf += sizeof(__m512i);
+		bmask = ~UINT64CONST(0);
+
+		for (; buf < final; buf += sizeof(__m512i))
+		{
+			val = _mm512_load_si512((const __m512i *) buf);
+			vmasked = _mm512_and_si512(val, maskv);
+			cnt = _mm512_popcnt_epi64(vmasked);
+			accum = _mm512_add_epi64(accum, cnt);
+		}
+	}
+
+	/* Final iteration needs to ignore bytes that are not within the length */
+	bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
+
+	val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
+	vmasked = _mm512_and_si512(val, maskv);
+	cnt = _mm512_popcnt_epi64(vmasked);
+	accum = _mm512_add_epi64(accum, cnt);
+
+	return _mm512_reduce_add_epi64(accum);
+}
+
 #endif							/* TRY_POPCNT_FAST */
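
For readers who want to try the core trick outside the PostgreSQL tree, the following is a minimal, standalone C sketch of the same masked-popcount idea: broadcast the per-byte mask across a word with ~0 / 0xFF * mask, AND it against each word, and count the surviving bits. The demo_popcount_masked() helper, the memcpy-based loads, and __builtin_popcountll (GCC/Clang) are assumptions of this sketch only and are not part of the patch, which instead dispatches through pg_popcount_masked_optimized() and falls back to the pg_number_of_ones lookup table.

/*
 * Standalone sketch of the masked-popcount technique, assuming GCC/Clang for
 * __builtin_popcountll.  The demo_* names are illustrative only and do not
 * appear in the PostgreSQL source.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
demo_popcount_masked(const unsigned char *buf, size_t bytes, uint8_t mask)
{
	uint64_t	popcnt = 0;

	/* Broadcast the byte mask to every byte of a 64-bit word. */
	uint64_t	maskv = ~UINT64_C(0) / 0xFF * mask;

	/* Count eight bytes at a time, masking before the popcount. */
	while (bytes >= 8)
	{
		uint64_t	word;

		memcpy(&word, buf, 8);	/* sidestep alignment and aliasing concerns */
		popcnt += (uint64_t) __builtin_popcountll(word & maskv);
		buf += 8;
		bytes -= 8;
	}

	/* Count any remaining tail bytes one at a time. */
	while (bytes--)
		popcnt += (uint64_t) __builtin_popcountll((unsigned long long) (*buf++ & mask));

	return popcnt;
}

int
main(void)
{
	/* Each byte holds four bit pairs; 0x55 selects the low bit of each pair,
	 * 0xaa the high bit, mirroring VISIBLE_MASK8 and FROZEN_MASK8 above. */
	unsigned char map[] = {0xff, 0x55, 0xaa, 0x00, 0xff, 0x55, 0xaa, 0x00, 0xf0};

	printf("visible bits: %llu\n",
		   (unsigned long long) demo_popcount_masked(map, sizeof(map), 0x55));
	printf("frozen bits:  %llu\n",
		   (unsigned long long) demo_popcount_masked(map, sizeof(map), 0xaa));
	return 0;
}

With the visibility map's bit-pair layout, a mask of 0x55 counts only the all-visible bits and 0xaa only the all-frozen bits, which is exactly how visibilitymap_count() uses VISIBLE_MASK8 and FROZEN_MASK8 in the patch.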