postgresql/src/port/pg_popcount_avx512.c

/*-------------------------------------------------------------------------
 *
 * pg_popcount_avx512.c
 *	  Holds the AVX-512 pg_popcount() implementation.
 *
 * Copyright (c) 2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/port/pg_popcount_avx512.c
 *
 *-------------------------------------------------------------------------
 */
#include "c.h"

#include <immintrin.h>

#include "port/pg_bitutils.h"

/*
 * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
 * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
 * the function pointers that are only used when TRY_POPCNT_FAST is set.
 */
#ifdef TRY_POPCNT_FAST

/*
 * pg_popcount_avx512
 *		Returns the number of 1-bits in buf
 *
 * buf points to the first byte to examine and bytes is the number of bytes to
 * examine.  If bytes is zero or negative, 0 is returned and no memory is
 * accessed.
 *
 * The input is consumed in chunks of sizeof(__m512i) bytes that start on
 * sizeof(__m512i) boundaries.  Bytes of the first and last chunk that lie
 * outside the requested range are excluded with a per-byte load mask.
 */
uint64
pg_popcount_avx512(const char *buf, int bytes)
{
	__m512i		val,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	mask = ~UINT64CONST(0);

	/*
	 * Nothing to count.  This has to be checked up front: the arithmetic
	 * below assumes the range holds at least one byte.  For an empty range
	 * that starts on a chunk boundary it would compute a full mask for the
	 * final chunk and count sizeof(__m512i) bytes that are not part of the
	 * input.
	 */
	if (bytes <= 0)
		return 0;

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 *
	 * tail_idx is the number of in-range bytes in the final chunk; it is
	 * always between 1 and sizeof(__m512i), which keeps the shift count used
	 * for the tail mask below smaller than the mask width.
	 */
	mask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		/* First chunk: the mask drops the bytes in front of the range */
		val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
		cnt = _mm512_popcnt_epi64(val);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		mask = ~UINT64CONST(0);

		/* Interior chunks are entirely in range, so load them unmasked */
		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			cnt = _mm512_popcnt_epi64(val);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * Final iteration needs to ignore bytes that are not within the length.
	 * If the whole range fits in a single chunk, the block above was skipped
	 * and mask still excludes the preceding bytes, so both ends are trimmed
	 * here.
	 */
	mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
	cnt = _mm512_popcnt_epi64(val);
	accum = _mm512_add_epi64(accum, cnt);

	/* Sum the eight 64-bit lane counters into the final result */
	return _mm512_reduce_add_epi64(accum);
}

/*
 * pg_popcount_masked_avx512
 *		Returns the number of 1-bits in buf after applying the mask to each byte
 *
 * buf points to the first byte to examine, bytes is the number of bytes to
 * examine, and mask is ANDed with every byte before its bits are counted.  If
 * bytes is zero or negative, 0 is returned and no memory is accessed.
 *
 * The input is consumed in chunks of sizeof(__m512i) bytes that start on
 * sizeof(__m512i) boundaries.  Bytes of the first and last chunk that lie
 * outside the requested range are excluded with a per-byte load mask (bmask),
 * which is unrelated to the caller-supplied bit mask.
 */
uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
	__m512i		val,
				vmasked,
				cnt;
	__m512i		accum = _mm512_setzero_si512();
	const char *final;
	int			tail_idx;
	__mmask64	bmask = ~UINT64CONST(0);
	const __m512i maskv = _mm512_set1_epi8(mask);

	/*
	 * Nothing to count.  This has to be checked up front: the arithmetic
	 * below assumes the range holds at least one byte.  For an empty range
	 * that starts on a chunk boundary it would compute a full byte mask for
	 * the final chunk and count sizeof(__m512i) bytes that are not part of
	 * the input.
	 */
	if (bytes <= 0)
		return 0;

	/*
	 * Align buffer down to avoid double load overhead from unaligned access.
	 * Calculate a mask to ignore preceding bytes.  Find start offset of final
	 * iteration and ensure it is not empty.
	 *
	 * tail_idx is the number of in-range bytes in the final chunk; it is
	 * always between 1 and sizeof(__m512i), which keeps the shift count used
	 * for the tail mask below smaller than the mask width.
	 */
	bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
	tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
	final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
	buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);

	/*
	 * Iterate through all but the final iteration.  Starting from the second
	 * iteration, the mask is ignored.
	 */
	if (buf < final)
	{
		/* First chunk: bmask drops the bytes in front of the range */
		val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
		vmasked = _mm512_and_si512(val, maskv);
		cnt = _mm512_popcnt_epi64(vmasked);
		accum = _mm512_add_epi64(accum, cnt);

		buf += sizeof(__m512i);
		bmask = ~UINT64CONST(0);

		/* Interior chunks are entirely in range, so load them unmasked */
		for (; buf < final; buf += sizeof(__m512i))
		{
			val = _mm512_load_si512((const __m512i *) buf);
			vmasked = _mm512_and_si512(val, maskv);
			cnt = _mm512_popcnt_epi64(vmasked);
			accum = _mm512_add_epi64(accum, cnt);
		}
	}

	/*
	 * Final iteration needs to ignore bytes that are not within the length.
	 * If the whole range fits in a single chunk, the block above was skipped
	 * and bmask still excludes the preceding bytes, so both ends are trimmed
	 * here.
	 */
	bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));

	val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
	vmasked = _mm512_and_si512(val, maskv);
	cnt = _mm512_popcnt_epi64(vmasked);
	accum = _mm512_add_epi64(accum, cnt);

	/* Sum the eight 64-bit lane counters into the final result */
	return _mm512_reduce_add_epi64(accum);
}

#endif							/* TRY_POPCNT_FAST */
Optimize pg_popcount() with AVX-512 instructions. Presently, pg_popcount() processes data in 32-bit or 64-bit chunks when possible. Newer hardware that supports AVX-512 instructions can use 512-bit chunks, which provides a nice speedup, especially for larger buffers. This commit introduces the infrastructure required to detect compiler and CPU support for the required AVX-512 intrinsic functions, and it adds a new pg_popcount() implementation that uses these functions. If CPU support for this optimized implementation is detected at runtime, a function pointer is updated so that it is used by subsequent calls to pg_popcount(). Most of the existing in-tree calls to pg_popcount() should benefit from these instructions, and calls with smaller buffers should at least not regress compared to v16. The new infrastructure introduced by this commit can also be used to optimize visibilitymap_count(), but that is left for a follow-up commit. Co-authored-by: Paul Amonson, Ants Aasma Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com 2024-04-07 04:56:23 +02:00			`/*-------------------------------------------------------------------------`
			`*`
			`* pg_popcount_avx512.c`
			`* Holds the AVX-512 pg_popcount() implementation.`
			`*`
			`* Copyright (c) 2024, PostgreSQL Global Development Group`
			`*`
			`* IDENTIFICATION`
			`* src/port/pg_popcount_avx512.c`
			`*`
			`*-------------------------------------------------------------------------`
			`*/`
			`#include "c.h"`

			`#include <immintrin.h>`

			`#include "port/pg_bitutils.h"`

			`/*`
			`* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to`
			`* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on`
			`* the function pointers that are only used when TRY_POPCNT_FAST is set.`
			`*/`
			`#ifdef TRY_POPCNT_FAST`

			`/*`
			`* pg_popcount_avx512`
			`* Returns the number of 1-bits in buf`
			`*/`
			`uint64`
			`pg_popcount_avx512(const char *buf, int bytes)`
			`{`
			`__m512i val,`
			`cnt;`
			`__m512i accum = _mm512_setzero_si512();`
			`const char *final;`
			`int tail_idx;`
			`__mmask64 mask = ~UINT64CONST(0);`

			`/*`
			`* Align buffer down to avoid double load overhead from unaligned access.`
			`* Calculate a mask to ignore preceding bytes. Find start offset of final`
			`* iteration and ensure it is not empty.`
			`*/`
			`mask <<= ((uintptr_t) buf) % sizeof(__m512i);`
			`tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;`
			`final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);`
			`buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);`

			`/*`
			`* Iterate through all but the final iteration. Starting from the second`
			`* iteration, the mask is ignored.`
			`*/`
			`if (buf < final)`
			`{`
			`val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);`
			`cnt = _mm512_popcnt_epi64(val);`
			`accum = _mm512_add_epi64(accum, cnt);`

			`buf += sizeof(__m512i);`
			`mask = ~UINT64CONST(0);`

			`for (; buf < final; buf += sizeof(__m512i))`
			`{`
			`val = _mm512_load_si512((const __m512i *) buf);`
			`cnt = _mm512_popcnt_epi64(val);`
			`accum = _mm512_add_epi64(accum, cnt);`
			`}`
			`}`

			`/* Final iteration needs to ignore bytes that are not within the length */`
			`mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));`

			`val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);`
			`cnt = _mm512_popcnt_epi64(val);`
			`accum = _mm512_add_epi64(accum, cnt);`

			`return _mm512_reduce_add_epi64(accum);`
			`}`

Optimize visibilitymap_count() with AVX-512 instructions. Commit 792752af4e added infrastructure for using AVX-512 intrinsic functions, and this commit uses that infrastructure to optimize visibilitymap_count(). Specifically, a new pg_popcount_masked() function is introduced that applies a bitmask to every byte in the buffer prior to calculating the population count, which is used to filter out the all-visible or all-frozen bits as needed. Platforms without AVX-512 support should also see a nice speedup due to the reduced number of calls to a function pointer. Co-authored-by: Ants Aasma Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com 2024-04-07 05:58:23 +02:00			`/*`
			`* pg_popcount_masked_avx512`
			`* Returns the number of 1-bits in buf after applying the mask to each byte`
			`*/`
			`uint64`
			`pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)`
			`{`
			`__m512i val,`
			`vmasked,`
			`cnt;`
			`__m512i accum = _mm512_setzero_si512();`
			`const char *final;`
			`int tail_idx;`
			`__mmask64 bmask = ~UINT64CONST(0);`
			`const __m512i maskv = _mm512_set1_epi8(mask);`

			`/*`
			`* Align buffer down to avoid double load overhead from unaligned access.`
			`* Calculate a mask to ignore preceding bytes. Find start offset of final`
			`* iteration and ensure it is not empty.`
			`*/`
			`bmask <<= ((uintptr_t) buf) % sizeof(__m512i);`
			`tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;`
			`final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);`
			`buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);`

			`/*`
			`* Iterate through all but the final iteration. Starting from the second`
			`* iteration, the mask is ignored.`
			`*/`
			`if (buf < final)`
			`{`
			`val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);`
			`vmasked = _mm512_and_si512(val, maskv);`
			`cnt = _mm512_popcnt_epi64(vmasked);`
			`accum = _mm512_add_epi64(accum, cnt);`

			`buf += sizeof(__m512i);`
			`bmask = ~UINT64CONST(0);`

			`for (; buf < final; buf += sizeof(__m512i))`
			`{`
			`val = _mm512_load_si512((const __m512i *) buf);`
			`vmasked = _mm512_and_si512(val, maskv);`
			`cnt = _mm512_popcnt_epi64(vmasked);`
			`accum = _mm512_add_epi64(accum, cnt);`
			`}`
			`}`

			`/* Final iteration needs to ignore bytes that are not within the length */`
			`bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));`

			`val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);`
			`vmasked = _mm512_and_si512(val, maskv);`
			`cnt = _mm512_popcnt_epi64(vmasked);`
			`accum = _mm512_add_epi64(accum, cnt);`

			`return _mm512_reduce_add_epi64(accum);`
			`}`

Optimize pg_popcount() with AVX-512 instructions. Presently, pg_popcount() processes data in 32-bit or 64-bit chunks when possible. Newer hardware that supports AVX-512 instructions can use 512-bit chunks, which provides a nice speedup, especially for larger buffers. This commit introduces the infrastructure required to detect compiler and CPU support for the required AVX-512 intrinsic functions, and it adds a new pg_popcount() implementation that uses these functions. If CPU support for this optimized implementation is detected at runtime, a function pointer is updated so that it is used by subsequent calls to pg_popcount(). Most of the existing in-tree calls to pg_popcount() should benefit from these instructions, and calls with smaller buffers should at least not regress compared to v16. The new infrastructure introduced by this commit can also be used to optimize visibilitymap_count(), but that is left for a follow-up commit. Co-authored-by: Paul Amonson, Ants Aasma Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com 2024-04-07 04:56:23 +02:00			`#endif /* TRY_POPCNT_FAST */`