Use native CRC instructions on 64-bit LoongArch

As with the Intel and Arm CRC instructions, compiler intrinsics for
them must be supported by the compiler. In contrast, no runtime check
is needed. Aligned memory access is faster, so use the Arm coding as
a model.

YANG Xudong

Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn
This commit is contained in:
John Naylor 2023-08-10 11:36:15 +07:00
parent fa2e874946
commit 4d14ccd6af
8 changed files with 240 additions and 17 deletions

View File

@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
# PGAC_LOONGARCH_CRC32C_INTRINSICS
# ---------------------------
# Check if the compiler supports the LoongArch CRCC instructions, using
# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
# intrinsic functions.
#
# We test for the 8-byte variant since platforms capable of running
# Postgres are 64-bit only (as of PG17), and we know CRC instructions
# are available there without a runtime check.
#
# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
AC_CACHE_CHECK(
[for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
[Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
[unsigned int crc = 0;
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
/* return computed value, to prevent the above being optimized away */
return crc == 0;])],
[Ac_cachevar=yes],
[Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
pgac_loongarch_crc32c_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_LOONGARCH_CRC32C_INTRINSICS

74
configure vendored
View File

@ -18047,6 +18047,47 @@ fi
fi
# Check for LoongArch CRC intrinsics to do CRC calculations.
#
# Check if __builtin_loongarch_crcc_* intrinsics can be used
# with the default compiler flags.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
unsigned int crc = 0;
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
/* return computed value, to prevent the above being optimized away */
return crc == 0;
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_loongarch_crc32c_intrinsics=yes
else
pgac_cv_loongarch_crc32c_intrinsics=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
pgac_loongarch_crc32c_intrinsics=yes
fi
# Select CRC-32C implementation.
@ -18063,9 +18104,12 @@ fi
# we're not targeting such a processor, but can nevertheless produce code that
# uses the CRC instructions, compile both, and select at runtime.
#
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
#
# If we are targeting a LoongArch processor, CRC instructions are
# always available (at least on 64 bit), so no runtime check is needed.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
# Use Intel SSE 4.2 if available.
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
else
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
fi
# LoongArch CRCC instructions.
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
USE_LOONGARCH_CRC32C=1
else
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
fi
fi
fi
fi
fi
@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
$as_echo "ARMv8 CRC instructions with runtime check" >&6; }
else
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
$as_echo "LoongArch CRCC instructions" >&6; }
else
$as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
$as_echo "slicing-by-8" >&6; }
fi
fi
fi
fi

View File

@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
fi
# Check for LoongArch CRC intrinsics to do CRC calculations.
#
# Check if __builtin_loongarch_crcc_* intrinsics can be used
# with the default compiler flags.
PGAC_LOONGARCH_CRC32C_INTRINSICS()
AC_SUBST(CFLAGS_CRC)
# Select CRC-32C implementation.
@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC)
# we're not targeting such a processor, but can nevertheless produce code that
# uses the CRC instructions, compile both, and select at runtime.
#
# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
# in the template or configure command line.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
#
# If we are targeting a LoongArch processor, CRC instructions are
# always available (at least on 64 bit), so no runtime check is needed.
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
# Use Intel SSE 4.2 if available.
if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
USE_SSE42_CRC32C=1
@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
else
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
fi
# LoongArch CRCC instructions.
if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
USE_LOONGARCH_CRC32C=1
else
# fall back to slicing-by-8 algorithm, which doesn't require any
# special CPU support.
USE_SLICING_BY_8_CRC32C=1
fi
fi
fi
fi
fi
@ -2166,9 +2180,15 @@ else
PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
else
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
AC_MSG_RESULT(slicing-by-8)
if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
AC_MSG_RESULT(LoongArch CRCC instructions)
else
AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
PG_CRC32C_OBJS="pg_crc32c_sb8.o"
AC_MSG_RESULT(slicing-by-8)
fi
fi
fi
fi

View File

@ -2065,6 +2065,30 @@ int main(void)
cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
have_optimized_crc = true
endif
elif host_cpu == 'loongarch64'
prog = '''
int main(void)
{
unsigned int crc = 0;
crc = __builtin_loongarch_crcc_w_b_w(0, crc);
crc = __builtin_loongarch_crcc_w_h_w(0, crc);
crc = __builtin_loongarch_crcc_w_w_w(0, crc);
crc = __builtin_loongarch_crcc_w_d_w(0, crc);
/* return computed value, to prevent the above being optimized away */
return crc == 0;
}
'''
if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
args: test_c_args)
# Use LoongArch CRC instruction unconditionally
cdata.set('USE_LOONGARCH_CRC32C', 1)
have_optimized_crc = true
endif
endif
if not have_optimized_crc

View File

@ -714,6 +714,9 @@
/* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
#undef USE_LLVM
/* Define to 1 to use LoongArch CRCC instructions. */
#undef USE_LOONGARCH_CRC32C
/* Define to 1 to build with LZ4 support. (--with-lz4) */
#undef USE_LZ4

View File

@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
#elif defined(USE_LOONGARCH_CRC32C)
/* Use LoongArch CRCC instructions. */
#define COMP_CRC32C(crc, data, len) \
((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*

View File

@ -92,6 +92,9 @@ replace_funcs_pos = [
['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
# loongarch
['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
# generic fallback
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]

View File

@ -0,0 +1,73 @@
/*-------------------------------------------------------------------------
*
* pg_crc32c_loongarch.c
* Compute CRC-32C checksum using LoongArch CRCC instructions
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/port/pg_crc32c_loongarch.c
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "port/pg_crc32c.h"
pg_crc32c
pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
{
const unsigned char *p = data;
const unsigned char *pend = p + len;
/*
* LoongArch doesn't require alignment, but aligned memory access is
* significantly faster. Process leading bytes so that the loop below
* starts with a pointer aligned to eight bytes.
*/
if (!PointerIsAligned(p, uint16) &&
p + 1 <= pend)
{
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
p += 1;
}
if (!PointerIsAligned(p, uint32) &&
p + 2 <= pend)
{
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
p += 2;
}
if (!PointerIsAligned(p, uint64) &&
p + 4 <= pend)
{
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
p += 4;
}
/* Process eight bytes at a time, as far as we can. */
while (p + 8 <= pend)
{
crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
p += 8;
}
/* Process remaining 0-7 bytes. */
if (p + 4 <= pend)
{
crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
p += 4;
}
if (p + 2 <= pend)
{
crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
p += 2;
}
if (p < pend)
{
crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
}
return crc;
}