/*-------------------------------------------------------------------------
*
* arch-x86.h
* Atomic operations considerations specific to intel x86
*
* Note that we actually require a 486 or newer, because the 386 doesn't have
* support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
* anymore, that's luckily not much of a restriction.
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* src/include/port/atomics/arch-x86.h
*
*-------------------------------------------------------------------------
*/
/*
* Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
* or stores to be reordered with other stores, but a load can be performed
* before a subsequent store.
*
* Technically, some x86-ish chips support uncached memory access and/or
* special instructions that are weakly ordered. In those cases we'd need
* the read and write barriers to be lfence and sfence. But since we don't
* do those things, a compiler barrier should be enough.
*
* "lock; addl" has worked for longer than "mfence". It's also rumored to be
* faster in many scenarios.
*/
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
#define pg_read_barrier_impl() pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
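/*
* Usage sketch: on x86's TSO memory model the read/write barriers above only
* need to stop compiler reordering, e.g. in a message-passing pattern built
* on the pg_read_barrier()/pg_write_barrier() wrappers from port/atomics.h.
* The names shared_data, data_ready and use() are hypothetical.
*
*    // producer
*    shared_data = 42;
*    pg_write_barrier();      // a compiler barrier is enough on x86
*    data_ready = true;
*
*    // consumer
*    if (data_ready)
*    {
*        pg_read_barrier();   // likewise just a compiler barrier here
*        use(shared_data);
*    }
*
* The full pg_memory_barrier() ("lock; addl" above) is only needed when a
* store must also be ordered before a later load.
*/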
/*
* Provide implementation for atomics using inline assembly on x86 gcc. It's
* nice to support older gcc versions, and the compare/exchange implementation
* here is actually more efficient than the __sync variant.
*/
#if defined(HAVE_ATOMICS)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;
/*
* It's too complicated to write inline asm for 64bit types on 32bit and the
* 486 can't do it anyway.
*/
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif /* __x86_64__ */
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
#endif /* defined(HAVE_ATOMICS) */
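/*
* Usage sketch: callers are not expected to touch these structs directly;
* they go through the generic wrappers declared in port/atomics.h, which
* dispatch to the *_impl functions below. A minimal example, assuming those
* wrappers; the counter name nrequests is hypothetical.
*
*    static pg_atomic_uint32 nrequests;
*
*    pg_atomic_init_u32(&nrequests, 0);
*    ...
*    uint32 seen = pg_atomic_fetch_add_u32(&nrequests, 1);
*    // seen holds the value before the increment
*/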
#if !defined(PG_HAVE_SPIN_DELAY)
/*
* This sequence is equivalent to the PAUSE instruction ("rep" is
* ignored by old IA32 processors if the following instruction is
* not a string operation); the IA-32 Architecture Software
* Developer's Manual, Vol. 3, Section 7.7.2 describes why using
* PAUSE in the inner loop of a spin lock is necessary for good
* performance:
*
* The PAUSE instruction improves the performance of IA-32
* processors supporting Hyper-Threading Technology when
* executing spin-wait loops and other routines where one
* thread is accessing a shared lock or semaphore in a tight
* polling loop. When executing a spin-wait loop, the
* processor can suffer a severe performance penalty when
* exiting the loop because it detects a possible memory order
* violation and flushes the core processor's pipeline. The
* PAUSE instruction provides a hint to the processor that the
* code sequence is a spin-wait loop. The processor uses this
* hint to avoid the memory order violation and prevent the
* pipeline flush. In addition, the PAUSE instruction
* de-pipelines the spin-wait loop to prevent it from
* consuming execution resources excessively.
*/
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
__asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code. Same code, MASM syntax */
    __asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
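/*
* Usage sketch: pg_spin_delay_impl() is reached through the pg_spin_delay()
* wrapper in port/atomics.h and is meant to sit inside a spin-wait loop,
* roughly like the (hypothetical) try_lock loop below.
*
*    while (!try_lock(&lock))
*        pg_spin_delay();    // emits "rep; nop", i.e. PAUSE
*/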
#if defined(HAVE_ATOMICS)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    char _res = 1;

    __asm__ __volatile__(
        " lock \n"
        " xchgb %0,%1 \n"
        : "+q"(_res), "+m"(ptr->value)
        :
        : "memory");
    return _res == 0;
}
#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
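/*
* Usage sketch: together, the two functions above amount to a test-and-set
* spinlock when used through the generic wrappers from port/atomics.h
* (assumed here); mylock is hypothetical.
*
*    static pg_atomic_flag mylock;
*
*    pg_atomic_init_flag(&mylock);
*    ...
*    while (!pg_atomic_test_set_flag(&mylock))
*        pg_spin_delay();            // spin until the xchgb wins
*    ... critical section ...
*    pg_atomic_clear_flag(&mylock);  // release: barrier + plain store
*/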
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it sets on equality, to
     * detect success.
     */
    __asm__ __volatile__(
        " lock \n"
        " cmpxchgl %4,%5 \n"
        " setz %2 \n"
        : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(ptr->value)
        : "memory", "cc");
    return (bool) ret;
}
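/*
* Usage sketch: a typical compare-and-swap retry loop built on the generic
* pg_atomic_compare_exchange_u32() wrapper (assumed here), computing an
* atomic bitwise OR; flags and mask are hypothetical. On failure *expected
* is refreshed with the currently stored value (via the "=a" output above),
* so the loop needs no explicit re-read.
*
*    uint32 old = pg_atomic_read_u32(&flags);
*
*    while (!pg_atomic_compare_exchange_u32(&flags, &old, old | mask))
*        ;    // old now holds the freshly observed value; retry
*/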
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32 res;

    __asm__ __volatile__(
        " lock \n"
        " xaddl %0,%1 \n"
        : "=q"(res), "=m"(ptr->value)
        : "0" (add_), "m"(ptr->value)
        : "memory", "cc");
    return res;
}
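/*
* Usage sketch: xadd returns the value *before* the addition, which is what
* makes it suitable for handing out unique ids or array slots; next_slot is
* hypothetical and the generic pg_atomic_fetch_add_u32() wrapper is assumed.
*
*    uint32 myslot = pg_atomic_fetch_add_u32(&next_slot, 1);
*    // myslot is distinct even when many backends run this concurrently
*/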
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it sets on equality, to
     * detect success.
     */
    __asm__ __volatile__(
        " lock \n"
        " cmpxchgq %4,%5 \n"
        " setz %2 \n"
        : "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m"(ptr->value)
        : "memory", "cc");
    return (bool) ret;
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64 res;

    __asm__ __volatile__(
        " lock \n"
        " xaddq %0,%1 \n"
        : "=q"(res), "=m"(ptr->value)
        : "0" (add_), "m"(ptr->value)
        : "memory", "cc");
    return res;
}
#endif /* __x86_64__ */
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
/*
* 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
* since at least the 586, as well as on all x86-64 cpus.
*/
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
    (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif /* 8 byte single-copy atomicity */
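/*
* Rough effect of the symbol above: when it is defined, the generic fallback
* code in port/atomics/generic.h can implement 64 bit atomic reads/writes as
* plain aligned loads/stores instead of going through a locked
* compare-and-swap, approximately:
*
*    #ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
*        return ptr->value;    // plain aligned 8-byte load is atomic
*    #else
*        ... fall back to a cmpxchg-based read ...
*    #endif
*/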
#endif /* HAVE_ATOMICS */