2022-08-03 18:49:04 +02:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* pg_lfind.h
|
2022-08-21 06:14:01 +02:00
|
|
|
* Optimized linear search routines using SIMD intrinsics where
|
|
|
|
* available.
|
2022-08-03 18:49:04 +02:00
|
|
|
*
|
|
|
|
* Copyright (c) 2022, PostgreSQL Global Development Group
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/include/port/pg_lfind.h
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#ifndef PG_LFIND_H
|
|
|
|
#define PG_LFIND_H
|
|
|
|
|
|
|
|
#include "port/simd.h"
|
|
|
|
|
2022-08-21 06:14:01 +02:00
|
|
|
/*
|
|
|
|
* pg_lfind8
|
|
|
|
*
|
|
|
|
* Return true if there is an element in 'base' that equals 'key', otherwise
|
|
|
|
* return false.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
pg_lfind8(uint8 key, uint8 *base, uint32 nelem)
|
|
|
|
{
|
|
|
|
uint32 i;
|
|
|
|
|
|
|
|
/* round down to multiple of vector length */
|
|
|
|
uint32 tail_idx = nelem & ~(sizeof(Vector8) - 1);
|
|
|
|
Vector8 chunk;
|
|
|
|
|
|
|
|
for (i = 0; i < tail_idx; i += sizeof(Vector8))
|
|
|
|
{
|
|
|
|
vector8_load(&chunk, &base[i]);
|
|
|
|
if (vector8_has(chunk, key))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process the remaining elements one at a time. */
|
|
|
|
for (; i < nelem; i++)
|
|
|
|
{
|
|
|
|
if (key == base[i])
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pg_lfind8_le
|
|
|
|
*
|
|
|
|
* Return true if there is an element in 'base' that is less than or equal to
|
|
|
|
* 'key', otherwise return false.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
|
|
|
|
{
|
|
|
|
uint32 i;
|
|
|
|
|
|
|
|
/* round down to multiple of vector length */
|
|
|
|
uint32 tail_idx = nelem & ~(sizeof(Vector8) - 1);
|
|
|
|
Vector8 chunk;
|
|
|
|
|
|
|
|
for (i = 0; i < tail_idx; i += sizeof(Vector8))
|
|
|
|
{
|
|
|
|
vector8_load(&chunk, &base[i]);
|
|
|
|
if (vector8_has_le(chunk, key))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Process the remaining elements one at a time. */
|
|
|
|
for (; i < nelem; i++)
|
|
|
|
{
|
|
|
|
if (base[i] <= key)
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2022-08-03 18:49:04 +02:00
|
|
|
/*
|
|
|
|
* pg_lfind32
|
|
|
|
*
|
|
|
|
* Return true if there is an element in 'base' that equals 'key', otherwise
|
|
|
|
* return false.
|
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
|
|
|
|
{
|
|
|
|
uint32 i = 0;
|
|
|
|
|
2022-08-29 08:40:53 +02:00
|
|
|
#ifndef USE_NO_SIMD
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
/*
|
2022-08-29 08:40:53 +02:00
|
|
|
* For better instruction-level parallelism, each loop iteration operates
|
|
|
|
* on a block of four registers. Testing for SSE2 has showed this is ~40%
|
|
|
|
* faster than using a block of two registers.
|
2022-08-03 18:49:04 +02:00
|
|
|
*/
|
2022-08-29 08:40:53 +02:00
|
|
|
const Vector32 keys = vector32_broadcast(key); /* load copies of key */
|
|
|
|
const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
|
|
|
|
const uint32 nelem_per_iteration = 4 * nelem_per_vector;
|
|
|
|
|
|
|
|
/* round down to multiple of elements per iteration */
|
|
|
|
const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
#if defined(USE_ASSERT_CHECKING)
|
|
|
|
bool assert_result = false;
|
|
|
|
|
|
|
|
/* pre-compute the result for assert checking */
|
|
|
|
for (i = 0; i < nelem; i++)
|
|
|
|
{
|
|
|
|
if (key == base[i])
|
|
|
|
{
|
|
|
|
assert_result = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2022-08-29 08:40:53 +02:00
|
|
|
for (i = 0; i < tail_idx; i += nelem_per_iteration)
|
2022-08-03 18:49:04 +02:00
|
|
|
{
|
2022-08-29 08:40:53 +02:00
|
|
|
Vector32 vals1,
|
|
|
|
vals2,
|
|
|
|
vals3,
|
|
|
|
vals4,
|
|
|
|
result1,
|
|
|
|
result2,
|
|
|
|
result3,
|
|
|
|
result4,
|
|
|
|
tmp1,
|
|
|
|
tmp2,
|
|
|
|
result;
|
|
|
|
|
|
|
|
/* load the next block into 4 registers */
|
|
|
|
vector32_load(&vals1, &base[i]);
|
|
|
|
vector32_load(&vals2, &base[i + nelem_per_vector]);
|
|
|
|
vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
|
|
|
|
vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
/* compare each value to the key */
|
2022-08-29 08:40:53 +02:00
|
|
|
result1 = vector32_eq(keys, vals1);
|
|
|
|
result2 = vector32_eq(keys, vals2);
|
|
|
|
result3 = vector32_eq(keys, vals3);
|
|
|
|
result4 = vector32_eq(keys, vals4);
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
/* combine the results into a single variable */
|
2022-08-29 08:40:53 +02:00
|
|
|
tmp1 = vector32_or(result1, result2);
|
|
|
|
tmp2 = vector32_or(result3, result4);
|
|
|
|
result = vector32_or(tmp1, tmp2);
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
/* see if there was a match */
|
2022-08-29 08:40:53 +02:00
|
|
|
if (vector8_is_highbit_set((Vector8) result))
|
2022-08-03 18:49:04 +02:00
|
|
|
{
|
|
|
|
Assert(assert_result == true);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
2022-08-29 08:40:53 +02:00
|
|
|
#endif /* ! USE_NO_SIMD */
|
2022-08-03 18:49:04 +02:00
|
|
|
|
|
|
|
/* Process the remaining elements one at a time. */
|
|
|
|
for (; i < nelem; i++)
|
|
|
|
{
|
|
|
|
if (key == base[i])
|
|
|
|
{
|
2022-08-29 08:40:53 +02:00
|
|
|
#ifndef USE_NO_SIMD
|
2022-08-03 18:49:04 +02:00
|
|
|
Assert(assert_result == true);
|
|
|
|
#endif
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-08-29 08:40:53 +02:00
|
|
|
#ifndef USE_NO_SIMD
|
2022-08-03 18:49:04 +02:00
|
|
|
Assert(assert_result == false);
|
|
|
|
#endif
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* PG_LFIND_H */
|