From 0ffc6f48fadf129413b070459e8d98a1cee3158d Mon Sep 17 00:00:00 2001
From: Dave Rodgman <dave.rodgman@arm.com>
Date: Wed, 24 May 2023 17:19:10 +0100
Subject: [PATCH] First draft at fixing the choice of asm

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
---
 library/bn_mul.h | 58 ++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/library/bn_mul.h b/library/bn_mul.h
index c5124702bd..92a9266303 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -659,39 +659,46 @@
 #endif /* TriCore */
 
 /*
+ * There is a fairly complex matrix of supported options for Thumb / Thumb2 / Arm
+ * assembly. Choosing the correct code path depends on the target, the compiler,
+ * and the optimisation level.
+ *
  * Note, gcc -O0 by default uses r7 for the frame pointer, so it complains about
  * our use of r7 below, unless -fomit-frame-pointer is passed.
  *
  * On the other hand, -fomit-frame-pointer is implied by any -Ox options with
  * x !=0, which we can detect using __OPTIMIZE__ (which is also defined by
  * clang and armcc5 under the same conditions).
- *
- * So, only use the optimized assembly below for optimized build, which avoids
- * the build error and is pretty reasonable anyway.
  */
-#if defined(__GNUC__) && !defined(__OPTIMIZE__)
-#define MULADDC_CANNOT_USE_R7
+
+
+#if defined(__thumb__) && !defined(__thumb2__) // Thumb1 (not Thumb 2) ISA
+// Only supported by gcc, when optimisation is enabled; only option A works
+#if defined(__OPTIMIZE__) && !defined(__ARMCC_VERSION)
+#define ARM_OPTION_A
 #endif
 
-/*
- * Similarly, we need to disable the assembly below if:
- * - compiler is armclang
- * - optimisation is not -O0
- * - target is Thumb
- * - target cpu is one of cortex-m0, cortex-m0plus, cortex-m1, cortex-m23, sc000
- *
- * Checking for __ARM_ARCH_6M__ or __ARM_ARCH_8M_BASE__ seems to identify exactly these
- * cpus and no others (tested against all values for -mcpu known to armclang 6.20).
- */
-#if defined(__ARMCC_VERSION) && defined(__OPTIMIZE__) && defined(__thumb__)
-#if defined(__ARM_ARCH_8M_BASE__) || defined(__ARM_ARCH_6M__)
-#define MULADDC_CANNOT_USE_R7
-#endif
+#elif defined(__thumb2__) // Thumb 2 ISA
+
+#if !defined(__ARMCC_VERSION) && !defined(__OPTIMIZE__)
+// gcc -O0
+// only option B builds
+#define ARM_OPTION_B
+#elif !defined(__ARMCC_VERSION)
+// gcc with optimisation - any option builds
+#define ARM_OPTION_A
+#else
+// armclang
+// options A or C build
+#define ARM_OPTION_A
 #endif
 
-#if defined(__arm__) && !defined(MULADDC_CANNOT_USE_R7)
+#elif defined(__arm__) // Arm ISA
+// any option builds. A does not seem to work; B is about 2x faster than C (under emulation).
+#define ARM_OPTION_B
+#endif
 
-#if defined(__thumb__) && !defined(__thumb2__)
+#if defined(ARM_OPTION_A)
 
 #define MULADDC_X1_INIT                                 \
     asm(                                                \
@@ -746,8 +753,7 @@
            "r6", "r7", "r8", "r9", "cc"         \
          );
 
-#elif (__ARM_ARCH >= 6) && \
-    defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)
+#elif defined(ARM_OPTION_B)
 
 #define MULADDC_X1_INIT                            \
     {                                              \
@@ -812,7 +818,7 @@
         );                                                   \
     }
 
-#else
+#elif defined(ARM_OPTION_C)
 
 #define MULADDC_X1_INIT                                 \
     asm(                                                \
@@ -840,9 +846,7 @@
            "r6", "r7", "cc"                     \
          );
 
-#endif /* Thumb */
-
-#endif /* ARMv3 */
+#endif /* Arm */
 
 #if defined(__alpha__)