/*-------------------------------------------------------------------------
 *
 * pg_lfind.h
 *	  Optimized linear search routines using SIMD intrinsics where
 *	  available.
 *
 * Copyright (c) 2022-2023, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/include/port/pg_lfind.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef PG_LFIND_H
#define PG_LFIND_H

#include "port/simd.h"

/*
 * pg_lfind8
 *
 * Return true if there is an element in 'base' that equals 'key', otherwise
 * return false.
 */
static inline bool
pg_lfind8(uint8 key, uint8 *base, uint32 nelem)
{
	uint32		i;

	/* round down to multiple of vector length */
	uint32		tail_idx = nelem & ~(sizeof(Vector8) - 1);
	Vector8		chunk;

	for (i = 0; i < tail_idx; i += sizeof(Vector8))
	{
		vector8_load(&chunk, &base[i]);
		if (vector8_has(chunk, key))
			return true;
	}

	/* Process the remaining elements one at a time. */
	for (; i < nelem; i++)
	{
		if (key == base[i])
			return true;
	}

	return false;
}

/*
 * pg_lfind8_le
 *
 * Return true if there is an element in 'base' that is less than or equal to
 * 'key', otherwise return false.
 */
static inline bool
pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
{
	uint32		i;

	/* round down to multiple of vector length */
	uint32		tail_idx = nelem & ~(sizeof(Vector8) - 1);
	Vector8		chunk;

	for (i = 0; i < tail_idx; i += sizeof(Vector8))
	{
		vector8_load(&chunk, &base[i]);
		if (vector8_has_le(chunk, key))
			return true;
	}

	/* Process the remaining elements one at a time. */
	for (; i < nelem; i++)
	{
		if (base[i] <= key)
			return true;
	}

	return false;
}

/*
 * pg_lfind32
 *
 * Return true if there is an element in 'base' that equals 'key', otherwise
 * return false.
 */
static inline bool
pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
{
	uint32		i = 0;

#ifndef USE_NO_SIMD

	/*
	 * For better instruction-level parallelism, each loop iteration operates
	 * on a block of four registers.  Testing for SSE2 has shown this is ~40%
	 * faster than using a block of two registers.
	 */
	const Vector32 keys = vector32_broadcast(key);	/* load copies of key */
	const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
	const uint32 nelem_per_iteration = 4 * nelem_per_vector;

	/* round down to multiple of elements per iteration */
	const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);

#if defined(USE_ASSERT_CHECKING)
	bool		assert_result = false;

	/* pre-compute the result for assert checking */
	for (i = 0; i < nelem; i++)
	{
		if (key == base[i])
		{
			assert_result = true;
			break;
		}
	}
#endif

	for (i = 0; i < tail_idx; i += nelem_per_iteration)
	{
		Vector32	vals1,
					vals2,
					vals3,
					vals4,
					result1,
					result2,
					result3,
					result4,
					tmp1,
					tmp2,
					result;

		/* load the next block into 4 registers */
		vector32_load(&vals1, &base[i]);
		vector32_load(&vals2, &base[i + nelem_per_vector]);
		vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
		vector32_load(&vals4, &base[i + nelem_per_vector * 3]);

		/* compare each value to the key */
		result1 = vector32_eq(keys, vals1);
		result2 = vector32_eq(keys, vals2);
		result3 = vector32_eq(keys, vals3);
		result4 = vector32_eq(keys, vals4);

		/* combine the results into a single variable */
		tmp1 = vector32_or(result1, result2);
		tmp2 = vector32_or(result3, result4);
		result = vector32_or(tmp1, tmp2);

		/* see if there was a match */
		if (vector32_is_highbit_set(result))
		{
			Assert(assert_result == true);
			return true;
		}
	}
#endif							/* ! USE_NO_SIMD */

	/* Process the remaining elements one at a time. */
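	/*
	 * Here 'i' is either 0 (no SIMD support, or nelem too small for a full
	 * block) or the first index not covered by the vectorized loop above.
	 */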
	for (; i < nelem; i++)
	{
		if (key == base[i])
		{
#ifndef USE_NO_SIMD
			Assert(assert_result == true);
#endif
			return true;
		}
	}

#ifndef USE_NO_SIMD
	Assert(assert_result == false);
#endif

	return false;
}

#endif							/* PG_LFIND_H */
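
/*
 * Usage sketch (illustrative only; the variables below are hypothetical and
 * not part of this header).  A caller with an array of uint32 values can
 * test for membership like this:
 *
 *		uint32		key = 42;
 *		uint32		values[] = {1, 7, 42, 99};
 *
 *		if (pg_lfind32(key, values, lengthof(values)))
 *			... 'key' is present in 'values' ...
 *
 * pg_lfind8() and pg_lfind8_le() follow the same calling convention for
 * arrays of uint8.
 */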