arm64: add optimized 16byte ld/st for armv8.4a+

16-byte ldp/stp are single-copy atomic on Armv8.4-A and later. See the Arm Architecture
Reference Manual, "Changes to single-copy atomicity in Armv8.4". Add atomic load/store/release
implementations that use these instructions, and add detection of the 8.4a+ capability.
This commit is contained in:
sguo35 2022-07-13 10:59:46 -07:00 committed by Ivan
parent b13fd68848
commit 9b19f16698
2 changed files with 46 additions and 0 deletions

View File

@ -1096,8 +1096,21 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
#endif
}
#elif defined(ARCH_ARM64)
// Atomically reads the 16-byte object `dest` and returns a copy of it.
// When ARM_FEATURE_LSE2 is defined, the read is a single ldp followed by a barrier;
// otherwise a fallback sequence is used (only partially visible in this view).
static inline T load(const T& dest)
{
#if defined(ARM_FEATURE_LSE2)
// Receives the two 64-bit halves read by ldp.
u64 data[2];
// ldp reads both halves with one instruction; the trailing "dmb ish" orders the
// load before any memory access that follows it.
// "Q" passes `dest` as a memory operand addressed through a single base register.
// The "1:" label is not referenced anywhere in this sequence.
// NOTE(review): 16-byte atomicity of ldp presumably requires `dest` to be 16-byte aligned - confirm.
__asm__ volatile("1:\n"
"ldp %x[data0], %x[data1], %[dest]\n"
"dmb ish\n"
: [data0] "=r"(data[0]), [data1] "=r"(data[1])
: [dest] "Q"(dest)
: "memory");
// Reassemble the two halves into a T without violating aliasing rules.
T result;
std::memcpy(&result, data, 16);
return result;
#else
u32 tmp;
u64 data[2];
__asm__ volatile("1:\n"
@ -1111,6 +1124,7 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
T result;
std::memcpy(&result, data, 16);
return result;
#endif
}
static inline T observe(const T& dest)
@ -1172,13 +1186,38 @@ struct atomic_storage<T, 16> : atomic_storage<T, 0>
// Atomically writes the 16-byte `value` into `dest`.
// When ARM_FEATURE_LSE2 is defined, the write is a single stp with a barrier on
// each side; otherwise the write is delegated to exchange().
static inline void store(T& dest, T value)
{
#if defined(ARM_FEATURE_LSE2)
// Split the value into two 64-bit halves so each can be passed in a general-purpose register.
u64 src[2];
std::memcpy(src, &value, 16);
// "dmb ish" before the stp orders earlier memory accesses ahead of the store;
// "dmb ish" after it orders the store ahead of later memory accesses.
// "=Q" declares `dest` as a written memory operand addressed through a single base register,
// and the "memory" clobber keeps the compiler from moving memory accesses across the statement.
// The "1:" label is not referenced anywhere in this sequence.
// NOTE(review): 16-byte atomicity of stp presumably requires `dest` to be 16-byte aligned - confirm.
__asm__ volatile("1:\n"
"dmb ish\n"
"stp %x[data0], %x[data1], %[dest]\n"
"dmb ish\n"
: [dest] "=Q" (dest)
: [data0] "r" (src[0]), [data1] "r" (src[1])
: "memory"
);
#else
// TODO: no dedicated store sequence without LSE2 yet; falls back to exchange()
exchange(dest, value);
#endif
}
// Atomically writes the 16-byte `value` into `dest`, ordered after earlier memory accesses.
// When ARM_FEATURE_LSE2 is defined, this is a barrier followed by a single stp, with no
// barrier after the store; otherwise the write is delegated to exchange().
static inline void release(T& dest, T value)
{
#if defined(ARM_FEATURE_LSE2)
// Split the value into two 64-bit halves so each can be passed in a general-purpose register.
u64 src[2];
std::memcpy(src, &value, 16);
// The leading "dmb ish" orders earlier memory accesses ahead of the stp.
// "=Q" declares `dest` as a written memory operand addressed through a single base register,
// and the "memory" clobber keeps the compiler from moving memory accesses across the statement.
// The "1:" label is not referenced anywhere in this sequence.
// NOTE(review): 16-byte atomicity of stp presumably requires `dest` to be 16-byte aligned - confirm.
__asm__ volatile("1:\n"
"dmb ish\n"
"stp %x[data0], %x[data1], %[dest]\n"
: [dest] "=Q" (dest)
: [data0] "r" (src[0]), [data1] "r" (src[1])
: "memory"
);
#else
// TODO: no dedicated release sequence without LSE2 yet; falls back to exchange()
exchange(dest, value);
#endif
}
#endif

View File

@ -16,6 +16,13 @@
#define ARCH_X64 1
#elif defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define ARCH_ARM64 1
// Armv8.4-A and later give us atomic 16-byte loads/stores (LSE2).
// See the Arm C Language Extensions (ACLE) documentation.
// There is currently no feature macro for LSE2 specifically, so we define our own.
// Unfortunately the integer __ARM_ARCH macro isn't universally defined, so we test the per-version macros instead.
#if defined(__ARM_ARCH_8_4__) || defined(__ARM_ARCH_8_5__) || defined(__ARM_ARCH_8_6__) || defined(__ARM_ARCH_9__)
#define ARM_FEATURE_LSE2 1
#endif
#endif
using std::chrono::steady_clock;