From 8939210d8aafcf809f4c29e16603ac162c942ebc Mon Sep 17 00:00:00 2001
From: Shawn Hoffman
Date: Mon, 25 Jul 2022 18:30:12 -0700
Subject: [PATCH] Common/Crypto/SHA1: implement accelerated versions

---
 Source/Core/Common/Crypto/SHA1.cpp | 349 +++++++++++++++++++++++++++++
 1 file changed, 349 insertions(+)

diff --git a/Source/Core/Common/Crypto/SHA1.cpp b/Source/Core/Common/Crypto/SHA1.cpp
index 6987f3670b..10bdb59741 100644
--- a/Source/Core/Common/Crypto/SHA1.cpp
+++ b/Source/Core/Common/Crypto/SHA1.cpp
@@ -3,6 +3,7 @@
 
 #include "SHA1.h"
 
+#include
 #include
 #include
@@ -10,6 +11,32 @@
 #include "Common/Assert.h"
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
+#include "Common/Swap.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#ifdef _M_X86_64
+#include <immintrin.h>
+#elif defined(_M_ARM_64)
+#if defined(__clang__)
+// This is a bit of a hack to get clang to accept the sha1 intrinsics without modifying cmdline
+// flags. Note __ARM_FEATURE_CRYPTO is deprecated and the "SHA2" flag is the lowest one which
+// includes SHA1.
+#define __ARM_FEATURE_SHA2
+// ...needed for older clang before they made the switchover to more granular flags.
+#define __ARM_FEATURE_CRYPTO
+#endif
+#include <arm_acle.h>
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef _MSC_VER
+#define ATTRIBUTE_TARGET(x)
+#else
+#define ATTRIBUTE_TARGET(x) [[gnu::target(x)]]
+#endif
 
 namespace Common::SHA1
 {
@@ -37,8 +64,330 @@ private:
   mbedtls_sha1_context ctx{};
 };
 
+class BlockContext : public Context
+{
+protected:
+  static constexpr size_t BLOCK_LEN = 64;
+  static constexpr u32 K[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
+  static constexpr u32 H[5]{0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0};
+
+  virtual void ProcessBlock(const u8* msg) = 0;
+  virtual Digest GetDigest() = 0;
+
+  virtual void Update(const u8* msg, size_t len) override
+  {
+    if (len == 0)
+      return;
+    msg_len += len;
+
+    if (block_used)
+    {
+      if (block_used + len >= block.size())
+      {
+        size_t rem = block.size() - block_used;
+        std::memcpy(&block[block_used], msg, rem);
+        ProcessBlock(&block[0]);
+        block_used = 0;
+        msg += rem;
+        len -= rem;
+      }
+      else
+      {
+        std::memcpy(&block[block_used], msg, len);
+        block_used += len;
+        return;
+      }
+    }
+    while (len >= BLOCK_LEN)
+    {
+      ProcessBlock(msg);
+      msg += BLOCK_LEN;
+      len -= BLOCK_LEN;
+    }
+    if (len)
+    {
+      std::memcpy(&block[0], msg, len);
+      block_used = len;
+    }
+  }
+
+  virtual Digest Finish() override
+  {
+    // block_used is guaranteed < BLOCK_LEN
+    block[block_used++] = 0x80;
+
+    constexpr size_t MSG_LEN_POS = BLOCK_LEN - sizeof(u64);
+    if (block_used > MSG_LEN_POS)
+    {
+      // Pad current block and process it
+      std::memset(&block[block_used], 0, BLOCK_LEN - block_used);
+      ProcessBlock(&block[0]);
+
+      // Pad a new block
+      std::memset(&block[0], 0, MSG_LEN_POS);
+    }
+    else
+    {
+      // Pad current block
+      std::memset(&block[block_used], 0, MSG_LEN_POS - block_used);
+    }
+
+    Common::BigEndianValue<u64> msg_bitlen(msg_len * 8);
+    std::memcpy(&block[MSG_LEN_POS], &msg_bitlen, sizeof(msg_bitlen));
+
+    ProcessBlock(&block[0]);
+
+    return GetDigest();
+  }
+
+  alignas(64) std::array<u8, BLOCK_LEN> block{};
+  size_t block_used{};
+  size_t msg_len{};
+};
+
+template <typename ValueType, size_t Size>
+class CyclicArray
+{
+public:
+  inline ValueType operator[](size_t i) const { return data[i % Size]; }
+  inline ValueType& operator[](size_t i) { return data[i % Size]; }
+  constexpr size_t size() { return Size; }
+
+private:
+  std::array<ValueType, Size> data;
+};
+
+#ifdef _M_X86_64
+
+// Uses the dedicated SHA1 instructions. Normal SSE(AVX*) would be needed for parallel
+// multi-message processing.
+// While Dolphin could gain from such an implementation, it requires the calling code to be
+// modified and/or the SHA1 implementation to be made asynchronous so it can optimistically batch.
+class ContextX64SHA1 final : public BlockContext
+{
+public:
+  ContextX64SHA1()
+  {
+    state[0] = _mm_set_epi32(H[0], H[1], H[2], H[3]);
+    state[1] = _mm_set_epi32(H[4], 0, 0, 0);
+  }
+
+private:
+  using WorkBlock = CyclicArray<__m128i, 4>;
+
+  ATTRIBUTE_TARGET("ssse3")
+  static inline __m128i byterev_16B(__m128i x)
+  {
+    return _mm_shuffle_epi8(x, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
+  }
+
+  template <size_t I>
+  ATTRIBUTE_TARGET("sha")
+  static inline __m128i MsgSchedule(WorkBlock* wblock)
+  {
+    auto& w = *wblock;
+    // Update and return this location
+    auto& wx = w[I];
+    // Do all the xors and rol(x,1) required for 4 rounds of msg schedule
+    wx = _mm_sha1msg1_epu32(wx, w[I + 1]);
+    wx = _mm_xor_si128(wx, w[I + 2]);
+    wx = _mm_sha1msg2_epu32(wx, w[I + 3]);
+    return wx;
+  }
+
+  ATTRIBUTE_TARGET("sha")
+  virtual void ProcessBlock(const u8* msg) override
+  {
+    // There are 80 rounds with 4 bytes per round, giving a 0x140-byte work space, but we can keep
+    // the active state in just 0x40 bytes.
+    // see FIPS 180-4 6.1.3 Alternate Method for Computing a SHA-1 Message Digest
+    WorkBlock w;
+    auto msg_block = (const __m128i*)msg;
+    for (size_t i = 0; i < w.size(); i++)
+      w[i] = byterev_16B(_mm_loadu_si128(&msg_block[i]));
+
+    // 0: abcd, 1: e
+    auto abcde = state;
+
+    // Not sure of a (non-ugly) way to have a constant-evaluated for-loop, so just rely on
+    // inlining. Problem is that sha1rnds4 requires an imm8 arg, and the first/last rounds have
+    // different behavior.
+
+    // clang-format off
+    // E0 += MSG0, special case of "nexte", can do normal add
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_add_epi32(abcde[1], w[0]), 0);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[1]), 0);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], w[2]), 0);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], w[3]), 0);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<4>(&w)), 0);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<5>(&w)), 1);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<6>(&w)), 1);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<7>(&w)), 1);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<8>(&w)), 1);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<9>(&w)), 1);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<10>(&w)), 2);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<11>(&w)), 2);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<12>(&w)), 2);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<13>(&w)), 2);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<14>(&w)), 2);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<15>(&w)), 3);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<16>(&w)), 3);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<17>(&w)), 3);
+    abcde[1] = _mm_sha1rnds4_epu32(abcde[0], _mm_sha1nexte_epu32(abcde[1], MsgSchedule<18>(&w)), 3);
+    abcde[0] = _mm_sha1rnds4_epu32(abcde[1], _mm_sha1nexte_epu32(abcde[0], MsgSchedule<19>(&w)), 3);
+    // state += abcde
+    state[1] = _mm_sha1nexte_epu32(abcde[1], state[1]);
+    state[0] = _mm_add_epi32(abcde[0], state[0]);
+    // clang-format on
+  }
+
+  virtual Digest GetDigest() override
+  {
+    Digest digest;
+    _mm_storeu_si128((__m128i*)&digest[0], byterev_16B(state[0]));
+    u32 hi = _mm_cvtsi128_si32(byterev_16B(state[1]));
+    std::memcpy(&digest[sizeof(__m128i)], &hi, sizeof(hi));
+    return digest;
+  }
+
+  std::array<__m128i, 2> state{};
+};
+
+#endif
+
+#ifdef _M_ARM_64
+
+// The armv8 flags are very annoying:
+// clang inserts "+" prefixes itself, gcc does not.
+// clang has deprecated "crypto" (removed in clang 13), gcc has not.
+#ifdef _MSC_VER
+#define TARGET_ARMV8_SHA1
+#elif defined(__clang__)
+#define TARGET_ARMV8_SHA1 [[gnu::target("sha2")]]
+#elif defined(__GNUC__)
+#define TARGET_ARMV8_SHA1 [[gnu::target("+crypto")]]
+#endif
+
+class ContextNeon final : public BlockContext
+{
+public:
+  ContextNeon()
+  {
+    state.abcd = vld1q_u32(&H[0]);
+    state.e = H[4];
+  }
+
+private:
+  using WorkBlock = CyclicArray<uint32x4_t, 4>;
+
+  struct State
+  {
+    // ARM thought they were being clever by exposing e as u32, but it actually makes non-asm
+    // implementations pretty annoying/makes the compiler's life needlessly difficult.
+    uint32x4_t abcd{};
+    u32 e{};
+  };
+
+  static inline uint32x4_t byterev_16B(uint32x4_t x)
+  {
+    // Just rev32 with casting wrappers
+    return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)));
+  }
+
+  TARGET_ARMV8_SHA1
+  static inline uint32x4_t MsgSchedule(WorkBlock* wblock, size_t i)
+  {
+    auto& w = *wblock;
+    // Update and return this location
+    auto& wx = w[0 + i];
+    wx = vsha1su0q_u32(wx, w[1 + i], w[2 + i]);
+    wx = vsha1su1q_u32(wx, w[3 + i]);
+    return wx;
+  }
+
+  template <size_t Func>
+  TARGET_ARMV8_SHA1 static inline constexpr uint32x4_t f(State state, uint32x4_t w)
+  {
+    const auto wk = vaddq_u32(w, vdupq_n_u32(K[Func]));
+    if constexpr (Func == 0)
+      return vsha1cq_u32(state.abcd, state.e, wk);
+    if constexpr (Func == 1 || Func == 3)
+      return vsha1pq_u32(state.abcd, state.e, wk);
+    if constexpr (Func == 2)
+      return vsha1mq_u32(state.abcd, state.e, wk);
+  }
+
+  template <size_t Func>
+  TARGET_ARMV8_SHA1 static inline constexpr State FourRounds(State state, uint32x4_t w)
+  {
+#ifdef _MSC_VER
+    // FIXME it seems the msvc optimizer gets a little too happy
+    _ReadBarrier();
+#endif
+    return {f<Func>(state, w), vsha1h_u32(vgetq_lane_u32(state.abcd, 0))};
+  }
+
+  virtual void ProcessBlock(const u8* msg) override
+  {
+    WorkBlock w;
+    for (size_t i = 0; i < w.size(); i++)
+      w[i] = byterev_16B(vreinterpretq_u32_u8(vld1q_u8(&msg[sizeof(uint32x4_t) * i])));
+
+    std::array<State, 2> states{state};
+
+    // Fashioned to look like the x64 impl.
+    // In each case the goal is to have the compiler inline + unroll everything.
+    states[1] = FourRounds<0>(states[0], w[0]);
+    states[0] = FourRounds<0>(states[1], w[1]);
+    states[1] = FourRounds<0>(states[0], w[2]);
+    states[0] = FourRounds<0>(states[1], w[3]);
+    states[1] = FourRounds<0>(states[0], MsgSchedule(&w, 4));
+    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 5));
+    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 6));
+    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 7));
+    states[1] = FourRounds<1>(states[0], MsgSchedule(&w, 8));
+    states[0] = FourRounds<1>(states[1], MsgSchedule(&w, 9));
+    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 10));
+    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 11));
+    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 12));
+    states[0] = FourRounds<2>(states[1], MsgSchedule(&w, 13));
+    states[1] = FourRounds<2>(states[0], MsgSchedule(&w, 14));
+    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 15));
+    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 16));
+    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 17));
+    states[1] = FourRounds<3>(states[0], MsgSchedule(&w, 18));
+    states[0] = FourRounds<3>(states[1], MsgSchedule(&w, 19));
+
+    state = {vaddq_u32(state.abcd, states[0].abcd), state.e + states[0].e};
+  }
+
+  virtual Digest GetDigest() override
+  {
+    Digest digest;
+    vst1q_u8(&digest[0], vreinterpretq_u8_u32(byterev_16B(state.abcd)));
+    u32 e = Common::FromBigEndian(state.e);
+    std::memcpy(&digest[sizeof(state.abcd)], &e, sizeof(e));
+    return digest;
+  }
+
+  State state;
+};
+
+#endif
+
 std::unique_ptr<Context> CreateContext()
 {
+  if (cpu_info.bSHA1)
+  {
+#ifdef _M_X86_64
+    // Note: As of mid 2022, > 99% of CPUs reporting to Steam survey have SSSE3, ~40% have SHA.
+    // Seems unlikely we'll see any CPUs supporting SHA but not SSSE3 (in the foreseeable future
+    // at least).
+    if (cpu_info.bSSSE3)
+      return std::make_unique<ContextX64SHA1>();
+#elif defined(_M_ARM_64)
+    return std::make_unique<ContextNeon>();
+#endif
+  }
   return std::make_unique<ContextMbed>();
 }
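
Note on usage: the runtime dispatch in CreateContext() is invisible to callers. A minimal sketch of how the streaming interface shown above gets exercised follows; it assumes only what the hunks show (CreateContext(), the virtual Update()/Finish() pair, and the Digest type), and the function name ExampleDigest is illustrative, not part of the patch.

// Minimal sketch: CreateContext() picks ContextX64SHA1, ContextNeon or ContextMbed based on
// cpu_info, so calling code stays the same on every platform.
Common::SHA1::Digest ExampleDigest(const u8* data, size_t size)
{
  auto ctx = Common::SHA1::CreateContext();
  ctx->Update(data, size);
  return ctx->Finish();
}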
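
For reference, both ProcessBlock() implementations compute the message schedule over a 16-word cyclic buffer (FIPS 180-4 6.1.3, the "alternate method"), four words per SIMD step instead of one. A portable scalar sketch of a single block, using hypothetical names (rotl, ProcessBlockScalar) that are not part of the patch, would look roughly like this:

#include <cstdint>

// Illustrative scalar reference only; the SIMD classes above compute the same thing.
static inline uint32_t rotl(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

static void ProcessBlockScalar(uint32_t state[5], const uint8_t* msg)
{
  static const uint32_t K[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};

  // Load the 16 message words big-endian, as byterev_16B does in the SIMD paths.
  uint32_t w[16];
  for (int i = 0; i < 16; i++)
    w[i] = (uint32_t(msg[4 * i]) << 24) | (uint32_t(msg[4 * i + 1]) << 16) |
           (uint32_t(msg[4 * i + 2]) << 8) | uint32_t(msg[4 * i + 3]);

  uint32_t a = state[0], b = state[1], c = state[2], d = state[3], e = state[4];
  for (int t = 0; t < 80; t++)
  {
    // 16-word cyclic schedule: W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]),
    // with all indices reduced mod 16 so only 0x40 bytes of schedule stay live.
    const int s = t % 16;
    if (t >= 16)
      w[s] = rotl(w[(s + 13) % 16] ^ w[(s + 8) % 16] ^ w[(s + 2) % 16] ^ w[s], 1);

    // Round function and constant change every 20 rounds (the imm8 / K[Func] groups above).
    uint32_t f;
    if (t < 20)
      f = (b & c) | (~b & d);
    else if (t < 40)
      f = b ^ c ^ d;
    else if (t < 60)
      f = (b & c) | (b & d) | (c & d);
    else
      f = b ^ c ^ d;

    const uint32_t tmp = rotl(a, 5) + f + e + K[t / 20] + w[s];
    e = d;
    d = c;
    c = rotl(b, 30);
    b = a;
    a = tmp;
  }

  state[0] += a;
  state[1] += b;
  state[2] += c;
  state[3] += d;
  state[4] += e;
}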