From 46267f6a2d80902aaea2c7468e0849d1cfc1d273 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Mon, 9 Oct 2023 14:47:50 +0100 Subject: [PATCH] Tidy-up: move GCM code into one place Signed-off-by: Dave Rodgman --- library/aesce.c | 131 +++++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 4c85941b2a..ffebbfd27a 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -90,65 +90,6 @@ #ifdef __ARM_NEON #include - -#if defined(MBEDTLS_ARCH_IS_ARM32) -#if defined(__clang__) -/* On clang for A32/T32, work around some missing intrinsics and types which are listed in [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1) */ - -#define vreinterpretq_p64_u8 (poly64x2_t) -#define vreinterpretq_u8_p128 (uint8x16_t) -#define vreinterpretq_u64_p64 (uint64x2_t) - -typedef uint8x16_t poly128_t; - -static inline poly128_t vmull_p64(poly64_t a, poly64_t b) -{ - poly128_t r; - asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :); - return r; -} - -static inline poly64x1_t vget_low_p64(poly64x2_t a) -{ - return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a)); -} - -static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b) -{ - return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)), - (poly64_t) (vget_high_u64((uint64x2_t) b))); -} - -#endif /* defined(__clang__) */ - -static inline uint8x16_t vrbitq_u8(uint8x16_t x) -{ - /* There is no vrbitq_u8 instruction in A32/T32, so provide - * an equivalent non-Neon implementation. Reverse bit order in each - * byte with 4x rbit, rev. */ - asm ("ldm %[p], { r2-r5 } \n\t" - "rbit r2, r2 \n\t" - "rev r2, r2 \n\t" - "rbit r3, r3 \n\t" - "rev r3, r3 \n\t" - "rbit r4, r4 \n\t" - "rev r4, r4 \n\t" - "rbit r5, r5 \n\t" - "rev r5, r5 \n\t" - "stm %[p], { r2-r5 } \n\t" - : - /* Output: 16 bytes of memory pointed to by &x */ - "+m" (*(uint8_t(*)[16]) &x) - : - [p] "r" (&x) - : - "r2", "r3", "r4", "r5" - ); - return x; -} - -#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */ - #else #error "Target does not support NEON instructions" #endif @@ -457,24 +398,87 @@ int mbedtls_aesce_setkey_enc(unsigned char *rk, #if defined(MBEDTLS_GCM_C) -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5 +#if defined(MBEDTLS_ARCH_IS_ARM32) + +#if defined(__clang__) +/* On clang for A32/T32, work around some missing intrinsics and types which are listed in + * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1) + * These are only required for GCM. + */ +#define vreinterpretq_p64_u8(a) ((poly64x2_t) a) +#define vreinterpretq_u8_p128(a) ((uint8x16_t) a) +#define vreinterpretq_u64_p64(a) ((uint64x2_t) a) + +typedef uint8x16_t poly128_t; + +static inline poly128_t vmull_p64(poly64_t a, poly64_t b) +{ + poly128_t r; + asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :); + return r; +} + +static inline poly64x1_t vget_low_p64(poly64x2_t a) +{ + uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a)); + return (poly64x1_t) r; +} + +static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b) +{ + return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)), + (poly64_t) (vget_high_u64((uint64x2_t) b))); +} + +#endif /* defined(__clang__) */ + +static inline uint8x16_t vrbitq_u8(uint8x16_t x) +{ + /* There is no vrbitq_u8 instruction in A32/T32, so provide + * an equivalent non-Neon implementation. Reverse bit order in each + * byte with 4x rbit, rev. */ + asm ("ldm %[p], { r2-r5 } \n\t" + "rbit r2, r2 \n\t" + "rev r2, r2 \n\t" + "rbit r3, r3 \n\t" + "rev r3, r3 \n\t" + "rbit r4, r4 \n\t" + "rev r4, r4 \n\t" + "rbit r5, r5 \n\t" + "rev r5, r5 \n\t" + "stm %[p], { r2-r5 } \n\t" + : + /* Output: 16 bytes of memory pointed to by &x */ + "+m" (*(uint8_t(*)[16]) &x) + : + [p] "r" (&x) + : + "r2", "r3", "r4", "r5" + ); + return x; +} +#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */ + + +#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5 /* Some intrinsics are not available for GCC 5.X. */ #define vreinterpretq_p64_u8(a) ((poly64x2_t) a) #define vreinterpretq_u8_p128(a) ((uint8x16_t) a) + static inline poly64_t vget_low_p64(poly64x2_t __a) { uint64x2_t tmp = (uint64x2_t) (__a); uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0)); return (poly64_t) (lo); } -#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5*/ +#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */ /* vmull_p64/vmull_high_p64 wrappers. * * Older compilers miss some intrinsic functions for `poly*_t`. We use * uint8x16_t and uint8x16x3_t as input/output parameters. */ -#if defined(__GNUC__) && !defined(__clang__) +#if defined(MBEDTLS_COMPILER_IS_GCC) /* GCC reports incompatible type error without cast. GCC think poly64_t and * poly64x1_t are different, that is different with MSVC and Clang. */ #define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b) @@ -483,7 +487,8 @@ static inline poly64_t vget_low_p64(poly64x2_t __a) * error with/without cast. And I think poly64_t and poly64x1_t are same, no * cast for clang also. */ #define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b) -#endif +#endif /* MBEDTLS_COMPILER_IS_GCC */ + static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b) {