210 lines
4.9 KiB
C
210 lines
4.9 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* pg_lfind.h
|
|
* Optimized linear search routines using SIMD intrinsics where
|
|
* available.
|
|
*
|
|
* Copyright (c) 2022-2024, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* src/include/port/pg_lfind.h
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#ifndef PG_LFIND_H
|
|
#define PG_LFIND_H
|
|
|
|
#include "port/simd.h"
|
|
|
|
/*
|
|
* pg_lfind8
|
|
*
|
|
* Return true if there is an element in 'base' that equals 'key', otherwise
|
|
* return false.
|
|
*/
|
|
static inline bool
|
|
pg_lfind8(uint8 key, uint8 *base, uint32 nelem)
|
|
{
|
|
uint32 i;
|
|
|
|
/* round down to multiple of vector length */
|
|
uint32 tail_idx = nelem & ~(sizeof(Vector8) - 1);
|
|
Vector8 chunk;
|
|
|
|
for (i = 0; i < tail_idx; i += sizeof(Vector8))
|
|
{
|
|
vector8_load(&chunk, &base[i]);
|
|
if (vector8_has(chunk, key))
|
|
return true;
|
|
}
|
|
|
|
/* Process the remaining elements one at a time. */
|
|
for (; i < nelem; i++)
|
|
{
|
|
if (key == base[i])
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* pg_lfind8_le
|
|
*
|
|
* Return true if there is an element in 'base' that is less than or equal to
|
|
* 'key', otherwise return false.
|
|
*/
|
|
static inline bool
|
|
pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
|
|
{
|
|
uint32 i;
|
|
|
|
/* round down to multiple of vector length */
|
|
uint32 tail_idx = nelem & ~(sizeof(Vector8) - 1);
|
|
Vector8 chunk;
|
|
|
|
for (i = 0; i < tail_idx; i += sizeof(Vector8))
|
|
{
|
|
vector8_load(&chunk, &base[i]);
|
|
if (vector8_has_le(chunk, key))
|
|
return true;
|
|
}
|
|
|
|
/* Process the remaining elements one at a time. */
|
|
for (; i < nelem; i++)
|
|
{
|
|
if (base[i] <= key)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* pg_lfind32_one_by_one_helper
|
|
*
|
|
* Searches the array of integers one-by-one. The caller is responsible for
|
|
* ensuring that there are at least "nelem" integers in the array.
|
|
*/
|
|
static inline bool
|
|
pg_lfind32_one_by_one_helper(uint32 key, const uint32 *base, uint32 nelem)
|
|
{
|
|
for (uint32 i = 0; i < nelem; i++)
|
|
{
|
|
if (key == base[i])
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
#ifndef USE_NO_SIMD
|
|
/*
|
|
* pg_lfind32_simd_helper
|
|
*
|
|
* Searches one 4-register-block of integers. The caller is responsible for
|
|
* ensuring that there are at least 4-registers-worth of integers remaining.
|
|
*/
|
|
static inline bool
|
|
pg_lfind32_simd_helper(const Vector32 keys, const uint32 *base)
|
|
{
|
|
const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
|
|
Vector32 vals1,
|
|
vals2,
|
|
vals3,
|
|
vals4,
|
|
result1,
|
|
result2,
|
|
result3,
|
|
result4,
|
|
tmp1,
|
|
tmp2,
|
|
result;
|
|
|
|
/* load the next block into 4 registers */
|
|
vector32_load(&vals1, base);
|
|
vector32_load(&vals2, &base[nelem_per_vector]);
|
|
vector32_load(&vals3, &base[nelem_per_vector * 2]);
|
|
vector32_load(&vals4, &base[nelem_per_vector * 3]);
|
|
|
|
/* compare each value to the key */
|
|
result1 = vector32_eq(keys, vals1);
|
|
result2 = vector32_eq(keys, vals2);
|
|
result3 = vector32_eq(keys, vals3);
|
|
result4 = vector32_eq(keys, vals4);
|
|
|
|
/* combine the results into a single variable */
|
|
tmp1 = vector32_or(result1, result2);
|
|
tmp2 = vector32_or(result3, result4);
|
|
result = vector32_or(tmp1, tmp2);
|
|
|
|
/* return whether there was a match */
|
|
return vector32_is_highbit_set(result);
|
|
}
|
|
#endif /* ! USE_NO_SIMD */
|
|
|
|
/*
|
|
* pg_lfind32
|
|
*
|
|
* Return true if there is an element in 'base' that equals 'key', otherwise
|
|
* return false.
|
|
*/
|
|
static inline bool
|
|
pg_lfind32(uint32 key, const uint32 *base, uint32 nelem)
|
|
{
|
|
#ifndef USE_NO_SIMD
|
|
uint32 i = 0;
|
|
|
|
/*
|
|
* For better instruction-level parallelism, each loop iteration operates
|
|
* on a block of four registers.
|
|
*/
|
|
const Vector32 keys = vector32_broadcast(key); /* load copies of key */
|
|
const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
|
|
const uint32 nelem_per_iteration = 4 * nelem_per_vector;
|
|
|
|
/* round down to multiple of elements per iteration */
|
|
const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
|
|
|
|
#if defined(USE_ASSERT_CHECKING)
|
|
bool assert_result = pg_lfind32_one_by_one_helper(key, base, nelem);
|
|
#endif
|
|
|
|
/*
|
|
* If there aren't enough elements for the SIMD code, use the standard
|
|
* one-by-one linear search code.
|
|
*/
|
|
if (nelem < nelem_per_iteration)
|
|
return pg_lfind32_one_by_one_helper(key, base, nelem);
|
|
|
|
/*
|
|
* Process as many elements as possible with a block of 4 registers.
|
|
*/
|
|
do
|
|
{
|
|
if (pg_lfind32_simd_helper(keys, &base[i]))
|
|
{
|
|
Assert(assert_result == true);
|
|
return true;
|
|
}
|
|
|
|
i += nelem_per_iteration;
|
|
|
|
} while (i < tail_idx);
|
|
|
|
/*
|
|
* Process the last 'nelem_per_iteration' elements in the array with a
|
|
* 4-register block. This will cause us to check a subset of the elements
|
|
* more than once, but that won't affect correctness, and testing has
|
|
* demonstrated that this helps more cases than it harms.
|
|
*/
|
|
Assert(assert_result == pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]));
|
|
return pg_lfind32_simd_helper(keys, &base[nelem - nelem_per_iteration]);
|
|
#else
|
|
/* Process the elements one at a time. */
|
|
return pg_lfind32_one_by_one_helper(key, base, nelem);
|
|
#endif
|
|
}
|
|
|
|
#endif /* PG_LFIND_H */
|