Inline pg_popcount() for small buffers.

If there aren't many bytes to process, the function call overhead
of the optimized implementation isn't worth taking, so instead we
inline a loop that consults pg_number_of_ones in that case.  If
there are many bytes to process, we accept the function call
overhead because the optimized versions are likely to be faster.
The threshold at which we use the optimized implementation is set
to the smallest amount of data required to use special popcount
instructions.

Reviewed-by: Alvaro Herrera, Tom Lane
Discussion: https://postgr.es/m/20240402155301.GA2750455%40nathanxps13
This commit is contained in:
Nathan Bossart 2024-04-03 12:22:02 -05:00
parent 6dbb490261
commit deb1486c7d
2 changed files with 42 additions and 8 deletions

View File

@ -302,16 +302,50 @@ pg_ceil_log2_64(uint64 num)
/* Attempt to use the POPCNT instruction, but perform a runtime check first */
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
#else
/* Use a portable implementation -- no need for a function pointer. */
extern int pg_popcount32(uint32 word);
extern int pg_popcount64(uint64 word);
extern uint64 pg_popcount(const char *buf, int bytes);
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
#endif /* TRY_POPCNT_FAST */
/*
 * pg_popcount
 *		Count the 1-bits in the "bytes" bytes starting at "buf".
 *
 * Short inputs are summed right here, one byte at a time, using the
 * pg_number_of_ones lookup table; for them the cost of calling the optimized
 * routine isn't worth paying.  Inputs at or above the cutoff are passed to
 * pg_popcount_optimized, where the call overhead is accepted because the
 * optimized versions are likely to be faster.
 */
static inline uint64
pg_popcount(const char *buf, int bytes)
{
	/*
	 * The cutoff is the input length at which the optimized version first
	 * gets to use special instructions: 8 bytes when pointers are at least
	 * 8 bytes wide, 4 bytes otherwise.
	 */
#if SIZEOF_VOID_P >= 8
	const int	cutoff = 8;
#else
	const int	cutoff = 4;
#endif
	uint64		total = 0;

	/* Long enough to be worth the function call. */
	if (bytes >= cutoff)
		return pg_popcount_optimized(buf, bytes);

	/* Otherwise add up the per-byte counts inline. */
	for (; bytes != 0; bytes--)
		total += pg_number_of_ones[(unsigned char) *buf++];

	return total;
}
/*
* Rotate the bits of "word" to the right/left by n bits.
*/

View File

@ -118,7 +118,7 @@ static uint64 pg_popcount_fast(const char *buf, int bytes);
int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
#endif /* TRY_POPCNT_FAST */
#ifdef TRY_POPCNT_FAST
@ -155,13 +155,13 @@ choose_popcount_functions(void)
{
pg_popcount32 = pg_popcount32_fast;
pg_popcount64 = pg_popcount64_fast;
pg_popcount = pg_popcount_fast;
pg_popcount_optimized = pg_popcount_fast;
}
else
{
pg_popcount32 = pg_popcount32_slow;
pg_popcount64 = pg_popcount64_slow;
pg_popcount = pg_popcount_slow;
pg_popcount_optimized = pg_popcount_slow;
}
}
@ -183,7 +183,7 @@ static uint64
pg_popcount_choose(const char *buf, int bytes)
{
choose_popcount_functions();
return pg_popcount(buf, bytes);
return pg_popcount_optimized(buf, bytes);
}
/*
@ -387,11 +387,11 @@ pg_popcount64(uint64 word)
}
/*
* pg_popcount
* pg_popcount_optimized
* Returns the number of 1-bits in buf
*/
uint64
pg_popcount(const char *buf, int bytes)
pg_popcount_optimized(const char *buf, int bytes)
{
return pg_popcount_slow(buf, bytes);
}