From 28f62f6212b3ad3541cd8e7bd30b03d9bd0acf3a Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Fri, 24 Jul 2020 02:06:46 +0200
Subject: [PATCH 01/10] Support running the benchmark with a single curve

If you pass a curve name to the benchmark program, the ECDH and ECDSA
benchmarks will only run for that particular curve. By default, all
curves are benchmarked.

To simplify the implementation, if you pass multiple curves, only the
last one will be benchmarked.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 programs/test/benchmark.c | 58 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/programs/test/benchmark.c b/programs/test/benchmark.c
index 251cbb692e..9c5911ba29 100644
--- a/programs/test/benchmark.c
+++ b/programs/test/benchmark.c
@@ -266,6 +266,21 @@ void ecp_clear_precomputed( mbedtls_ecp_group *grp )
 #define ecp_clear_precomputed( g )
 #endif
 
+#if defined(MBEDTLS_ECP_C)
+/* Copy the info for the curve named string to *curve: 1 if found, else 0. */
+static int set_ecp_curve( const char *string, mbedtls_ecp_curve_info *curve )
+{
+    const mbedtls_ecp_curve_info *info;
+
+    info = mbedtls_ecp_curve_info_from_name( string );
+    if( info == NULL )
+        return( 0 );
+
+    *curve = *info;
+    return( 1 );
+}
+#endif
+
 unsigned char buf[BUFSIZE];
 
 typedef struct {
@@ -289,6 +304,17 @@ int main( int argc, char *argv[] )
 #if defined(MBEDTLS_MEMORY_BUFFER_ALLOC_C)
     unsigned char alloc_buf[HEAP_SIZE] = { 0 };
 #endif
+#if defined(MBEDTLS_ECP_C)
+    mbedtls_ecp_curve_info single_curve[2] = {
+        { MBEDTLS_ECP_DP_NONE, 0, 0, NULL },
+        { MBEDTLS_ECP_DP_NONE, 0, 0, NULL },
+    };
+    const mbedtls_ecp_curve_info *curve_list = mbedtls_ecp_curve_list( );
+#endif
+
+#if defined(MBEDTLS_ECP_C)
+    (void) curve_list; /* Unused in some configurations where no benchmark uses ECC */
+#endif
 
     if( argc <= 1 )
     {
@@ -356,6 +382,10 @@ int main( int argc, char *argv[] )
                 todo.ecdsa = 1;
             else if( strcmp( argv[i], "ecdh" ) == 0 )
                 todo.ecdh = 1;
+#if defined(MBEDTLS_ECP_C)
+            else if( set_ecp_curve( argv[i], single_curve ) )
+                curve_list = single_curve;
+#endif
             else
             {
                 mbedtls_printf( "Unrecognized option: %s\n", argv[i] );
@@ -845,7 +875,7 @@ int main( int argc, char *argv[] )
 
         memset( buf, 0x2A, sizeof( buf ) );
 
-        for( curve_info = mbedtls_ecp_curve_list();
+        for( curve_info = curve_list;
              curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
              curve_info++ )
         {
@@ -867,7 +897,7 @@ int main( int argc, char *argv[] )
             mbedtls_ecdsa_free( &ecdsa );
         }
 
-        for( curve_info = mbedtls_ecp_curve_list();
+        for( curve_info = curve_list;
              curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
              curve_info++ )
         {
@@ -911,8 +941,23 @@ int main( int argc, char *argv[] )
         };
         const mbedtls_ecp_curve_info *curve_info;
         size_t olen;
+        const mbedtls_ecp_curve_info *selected_montgomery_curve_list =
+            montgomery_curve_list;
 
-        for( curve_info = mbedtls_ecp_curve_list();
+        if( curve_list == (const mbedtls_ecp_curve_info*) &single_curve )
+        {
+            mbedtls_ecp_group grp;
+            mbedtls_ecp_group_init( &grp );
+            if( mbedtls_ecp_group_load( &grp, curve_list->grp_id ) != 0 )
+                mbedtls_exit( 1 );
+            if( mbedtls_ecp_get_type( &grp ) == MBEDTLS_ECP_TYPE_MONTGOMERY )
+                selected_montgomery_curve_list = single_curve;
+            else /* empty list */
+                selected_montgomery_curve_list = single_curve + 1;
+            mbedtls_ecp_group_free( &grp );
+        }
+
+        for( curve_info = curve_list;
              curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
              curve_info++ )
         {
@@ -938,7 +983,7 @@ int main( int argc, char *argv[] )
         }
 
         /* Montgomery curves need to be handled separately */
-        for ( curve_info = montgomery_curve_list;
+        for ( curve_info = selected_montgomery_curve_list;
               curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
               curve_info++ )
         {
@@ -960,7 +1005,7 @@ int main( int argc, char *argv[] )
             mbedtls_mpi_free( &z );
         }
 
-        for( curve_info = mbedtls_ecp_curve_list();
+        for( curve_info = curve_list;
              curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
              curve_info++ )
         {
@@ -986,7 +1031,7 @@ int main( int argc, char *argv[] )
         }
 
         /* Montgomery curves need to be handled separately */
-        for ( curve_info = montgomery_curve_list;
+        for ( curve_info = selected_montgomery_curve_list;
               curve_info->grp_id != MBEDTLS_ECP_DP_NONE;
               curve_info++)
         {
@@ -1015,7 +1060,6 @@ int main( int argc, char *argv[] )
     {
         mbedtls_ecdh_context ecdh_srv, ecdh_cli;
         unsigned char buf_srv[BUFSIZE], buf_cli[BUFSIZE];
-        const mbedtls_ecp_curve_info * curve_list = mbedtls_ecp_curve_list();
         const mbedtls_ecp_curve_info *curve_info;
         size_t olen;
 

From d10e8fae9e30cac60297b1e1834002db183429e5 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Wed, 22 Jul 2020 19:58:28 +0200
Subject: [PATCH 02/10] Optimize fix_negative

Reduce the code size, stack consumption and heap consumption in
fix_negative by encoding the special-case subtraction manually.

* Code size: ecp_curves.o goes down from 7837 B to 7769 B in a
  sample Cortex-M0 build with all curves enabled. The savings come
  from not having to set up C in INIT (which is used many times) and
  from not having to catch errors in fix_negative.
* Stack consumption: get rid of C on the stack.
* Heap: mbedtls_mpi_sub_abs with destination == second operand would
  make a heap allocation. The new code doesn't do any heap allocation.
* Performance: no measurable difference.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/ecp_curves.c | 54 ++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index 839fb5e36e..a1aab5deb6 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -1000,25 +1000,20 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
 #define ADD( j )    add32( &cur, A( j ), &c );
 #define SUB( j )    sub32( &cur, A( j ), &c );
 
+#define ciL    (sizeof(mbedtls_mpi_uint))         /* chars in limb  */
+#define biL    (ciL << 3)               /* bits  in limb  */
+
 /*
  * Helpers for the main 'loop'
- * (see fix_negative for the motivation of C)
  */
 #define INIT( b )                                                       \
-    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;                                                            \
+    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;                    \
     signed char c = 0, cc;                                              \
     uint32_t cur;                                                       \
     size_t i = 0, bits = (b);                                           \
-    mbedtls_mpi C;                                                      \
-    mbedtls_mpi_uint Cp[ (b) / 8 / sizeof( mbedtls_mpi_uint) + 1 ];     \
-                                                                        \
-    C.s = 1;                                                            \
-    C.n = (b) / 8 / sizeof( mbedtls_mpi_uint) + 1;                      \
-    C.p = Cp;                                                           \
-    memset( Cp, 0, C.n * sizeof( mbedtls_mpi_uint ) );                  \
-                                                                        \
-    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( N, (b) * 2 / 8 /                 \
-                                       sizeof( mbedtls_mpi_uint ) ) );  \
+    /* N is the size of the product of two b-bit numbers, plus one */   \
+    /* limb for fix_negative */                                         \
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( N, ( b ) * 2 / biL + 1 ) );      \
     LOAD32;
 
 #define NEXT                    \
@@ -1033,33 +1028,32 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
     STORE32; i++;                               \
     cur = c > 0 ? c : 0; STORE32;               \
     cur = 0; while( ++i < MAX32 ) { STORE32; }  \
-    if( c < 0 ) MBEDTLS_MPI_CHK( fix_negative( N, c, &C, bits ) );
+    if( c < 0 ) fix_negative( N, c, bits );
 
 /*
  * If the result is negative, we get it in the form
  * c * 2^(bits + 32) + N, with c negative and N positive shorter than 'bits'
  */
-static inline int fix_negative( mbedtls_mpi *N, signed char c, mbedtls_mpi *C, size_t bits )
+static inline void fix_negative( mbedtls_mpi *N, signed char c, size_t bits )
 {
-    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+    size_t i;
 
-    /* C = - c * 2^(bits + 32) */
-#if !defined(MBEDTLS_HAVE_INT64)
-    ((void) bits);
-#else
-    if( bits == 224 )
-        C->p[ C->n - 1 ] = ((mbedtls_mpi_uint) -c) << 32;
-    else
-#endif
-        C->p[ C->n - 1 ] = (mbedtls_mpi_uint) -c;
-
-    /* N = - ( C - N ) */
-    MBEDTLS_MPI_CHK( mbedtls_mpi_sub_abs( N, C, N ) );
+    /* Negate |N| over limbs 0..bits/8/ciL in two's complement: complement
+     * each limb, then add 1 and carry while a limb wraps around to 0. */
+    for( i = 0; i <= bits / 8 / sizeof( mbedtls_mpi_uint ); i++ )
+        N->p[i] = ~(mbedtls_mpi_uint)0 - N->p[i];
+    for( i = 0; i <= bits / 8 / sizeof( mbedtls_mpi_uint ); i++ )
+        if( ++N->p[i] != 0 ) break;
     N->s = -1;
 
-cleanup:
-
-    return( ret );
+    /* Add |c| * 2^(bits + 32) to the absolute value. Since c and N are
+     * negative, this adds c * 2^(bits + 32). */
+    mbedtls_mpi_uint msw = (mbedtls_mpi_uint) -c;
+#if defined(MBEDTLS_HAVE_INT64)
+    if( bits == 224 )
+        msw <<= 32;
+#endif
+    N->p[bits / 8 / sizeof( mbedtls_mpi_uint)] += msw;
 }
 
 #if defined(MBEDTLS_ECP_DP_SECP224R1_ENABLED)

From 1acf7cb76c2d3f55d536169992db53987893571c Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Thu, 23 Jul 2020 01:03:22 +0200
Subject: [PATCH 03/10] Avoid reallocating during subtraction

mbedtls_mpi_sub_abs systematically allocated a new mpi when the result
was aliased with the right operand (i.e. X = A - X). This aliasing
very commonly happens during ECP operations. Rewrite the function to
allocate only if the result might not fit otherwise.

This costs a few bytes of code size in bignum.o, and might make
mbedtls_mpi_sub_abs very slightly slower when no reallocation is
done. However, there is a substantial performance gain in ECP
operations with Montgomery curves (10-20% on my PC).

test_suite_ecp drops from 1422794 to 1271506 calls to calloc().

This commit also fixes a bug whereby mbedtls_mpi_sub_abs would leak
memory when X == B (so TB was in use) and the result was negative.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 68 ++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/library/bignum.c b/library/bignum.c
index b11239e274..5cd1c3e842 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1339,29 +1339,32 @@ cleanup:
 /**
  * Helper for mbedtls_mpi subtraction.
  *
- * Calculate d - s where d and s have the same size.
+ * Calculate l - r where l and r have the same size.
  * This function operates modulo (2^ciL)^n and returns the carry
- * (1 if there was a wraparound, i.e. if `d < s`, and 0 otherwise).
+ * (1 if there was a wraparound, i.e. if `l < r`, and 0 otherwise).
  *
- * \param n             Number of limbs of \p d and \p s.
- * \param[in,out] d     On input, the left operand.
- *                      On output, the result of the subtraction:
- * \param[in] s         The right operand.
+ * d may be aliased to l or r.
  *
- * \return              1 if `d < s`.
- *                      0 if `d >= s`.
+ * \param n             Number of limbs of \p d, \p l and \p r.
+ * \param[out] d        The result of the subtraction.
+ * \param[in] l         The left operand.
+ * \param[in] r         The right operand.
+ *
+ * \return              1 if `l < r`.
+ *                      0 if `l >= r`.
  */
 static mbedtls_mpi_uint mpi_sub_hlp( size_t n,
                                      mbedtls_mpi_uint *d,
-                                     const mbedtls_mpi_uint *s )
+                                     const mbedtls_mpi_uint *l,
+                                     const mbedtls_mpi_uint *r )
 {
     size_t i;
-    mbedtls_mpi_uint c, z;
+    mbedtls_mpi_uint c = 0, t, z;
 
-    for( i = c = 0; i < n; i++, s++, d++ )
+    for( i = 0; i < n; i++ )
     {
-        z = ( *d <  c );     *d -=  c;
-        c = ( *d < *s ) + z; *d -= *s;
+        z = ( l[i] <  c );    t = l[i] - c;
+        c = ( t < r[i] ) + z; d[i] = t - r[i];
     }
 
     return( c );
@@ -1372,7 +1375,6 @@ static mbedtls_mpi_uint mpi_sub_hlp( size_t n,
  */
 int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
-    mbedtls_mpi TB;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     size_t n;
     mbedtls_mpi_uint carry;
@@ -1380,29 +1382,21 @@ int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi
     MPI_VALIDATE_RET( A != NULL );
     MPI_VALIDATE_RET( B != NULL );
 
-    mbedtls_mpi_init( &TB );
-
-    if( X == B )
-    {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TB, B ) );
-        B = &TB;
-    }
-
-    if( X != A )
-        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
-
-    /*
-     * X should always be positive as a result of unsigned subtractions.
-     */
-    X->s = 1;
-
-    ret = 0;
-
     for( n = B->n; n > 0; n-- )
         if( B->p[n - 1] != 0 )
             break;
 
-    carry = mpi_sub_hlp( n, X->p, B->p );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, A->n ) );
+
+    /* Set the high limbs of X to match A. Don't touch the lower limbs
+     * because X might be aliased to B, and we must not overwrite the
+     * significant digits of B. */
+    if( A->n > n )
+        memcpy( X->p + n, A->p + n, ( A->n - n ) * ciL );
+    if( X->n > A->n )
+        memset( X->p + A->n, 0, ( X->n - A->n ) * ciL );
+
+    carry = mpi_sub_hlp( n, X->p, A->p, B->p );
     if( carry != 0 )
     {
         /* Propagate the carry to the first nonzero limb of X. */
@@ -1418,10 +1412,10 @@ int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi
         --X->p[n];
     }
 
+    /* X should always be positive as a result of unsigned subtractions. */
+    X->s = 1;
+
 cleanup:
-
-    mbedtls_mpi_free( &TB );
-
     return( ret );
 }
 
@@ -2065,7 +2059,7 @@ static void mpi_montmul( mbedtls_mpi *A, const mbedtls_mpi *B, const mbedtls_mpi
      * do the calculation without using conditional tests. */
     /* Set d to d0 + (2^biL)^n - N where d0 is the current value of d. */
     d[n] += 1;
-    d[n] -= mpi_sub_hlp( n, d, N->p );
+    d[n] -= mpi_sub_hlp( n, d, d, N->p );
     /* If d0 < N then d < (2^biL)^n
      * so d[n] == 0 and we want to keep A as it is.
      * If d0 >= N then d >= (2^biL)^n, and d <= (2^biL)^n + N < 2 * (2^biL)^n

From a5d8d89cca057e4541e7297c3f9457d2ab222088 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Thu, 23 Jul 2020 21:27:15 +0200
Subject: [PATCH 04/10] Document mpi_mul_hlp

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/library/bignum.c b/library/bignum.c
index 5cd1c3e842..a847e5071e 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1525,8 +1525,21 @@ int mbedtls_mpi_sub_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_sint
     return( mbedtls_mpi_sub_mpi( X, A, &_B ) );
 }
 
-/*
- * Helper for mbedtls_mpi multiplication
+/** Helper for mbedtls_mpi multiplication.
+ *
+ * Add \p b * \p s to \p d.
+ *
+ * \param i             The number of limbs of \p s.
+ * \param[in] s         A bignum to multiply, of size \p i.
+ *                      It may overlap with \p d, but only if
+ *                      \p d <= \p s.
+ *                      Its leading limb must not be \c 0.
+ * \param[in,out] d     The bignum to add to.
+ *                      It must be sufficiently large to store the
+ *                      result of the multiplication. This means
+ *                      \p i + 1 limbs if \p d[\p i - 1] started as 0 and \p b
+ *                      is not known a priori.
+ * \param b             A scalar to multiply.
  */
 static
 #if defined(__APPLE__) && defined(__arm__)
@@ -1536,7 +1549,10 @@ static
  */
 __attribute__ ((noinline))
 #endif
-void mpi_mul_hlp( size_t i, mbedtls_mpi_uint *s, mbedtls_mpi_uint *d, mbedtls_mpi_uint b )
+void mpi_mul_hlp( size_t i,
+                  const mbedtls_mpi_uint *s,
+                  mbedtls_mpi_uint *d,
+                  mbedtls_mpi_uint b )
 {
     mbedtls_mpi_uint c = 0, t = 0;
 

From 8fd95c6757509432f5e09ef5ddf730b47aacf076 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Thu, 23 Jul 2020 21:58:50 +0200
Subject: [PATCH 05/10] Perform mbedtls_mpi_mul_int in place if possible

Rewrite mbedtls_mpi_mul_int to call mpi_mul_hlp directly rather than
create a temporary mpi object. This has the benefit of not performing
an allocation when the multiplication is in place (mpi operand aliased
with the result) and the result mpi is large enough.

This saves about 40% of the calloc() calls in test_suite_ecp. There is
no measurable performance difference on my Linux PC.

The cost is a few bytes in bignum.o.

When there is no aliasing, or when there is aliasing but the mpi
object needs to be enlarged, the performance difference is negligible.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/library/bignum.c b/library/bignum.c
index a847e5071e..0eb212560a 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1658,17 +1658,30 @@ cleanup:
  */
 int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint b )
 {
-    mbedtls_mpi _B;
-    mbedtls_mpi_uint p[1];
     MPI_VALIDATE_RET( X != NULL );
     MPI_VALIDATE_RET( A != NULL );
 
-    _B.s = 1;
-    _B.n = 1;
-    _B.p = p;
-    p[0] = b;
+    /* mpi_mul_hlp can't deal with a leading 0. */
+    size_t n = A->n;
+    while( n > 0 && A->p[n - 1] == 0 )
+        --n;
 
-    return( mbedtls_mpi_mul_mpi( X, A, &_B ) );
+    /* The general method below doesn't work if n==0 or b==0. By chance
+     * calculating the result is trivial in those cases. */
+    if( b == 0 || n == 0 )
+    {
+        mbedtls_mpi_lset( X, 0 );
+        return( 0 );
+    }
+
+    /* Calculate X*b as A + A*(b-1) to take advantage of mpi_mul_hlp */
+    int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, n + 1 ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
+    mpi_mul_hlp( n, A->p, X->p, b - 1 );
+
+cleanup:
+    return( ret );
 }
 
 /*

From 8e464c407a1ce8b88412c6a8cc8aafa8d2cf1b0f Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Fri, 24 Jul 2020 00:08:38 +0200
Subject: [PATCH 06/10] mpi_mul_hlp: microoptimization

If c == 0, no need to add it to *d.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/library/bignum.c b/library/bignum.c
index 0eb212560a..af9a399b6f 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1607,10 +1607,10 @@ void mpi_mul_hlp( size_t i,
 
     t++;
 
-    do {
+    while( c != 0 )
+    {
         *d += c; c = ( *d < c ); d++;
     }
-    while( c != 0 );
 }
 
 /*

From cd0dbf36b6e35b8d02fe9535186fc79cd7e245a0 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Fri, 24 Jul 2020 00:09:04 +0200
Subject: [PATCH 07/10] mbedtls_mpi_mul_int: no microoptimization

Note a possible microoptimization in mbedtls_mpi_mul_int that I tried
in the hope of reducing the number of allocations, but that turned out
to be counterproductive.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/library/bignum.c b/library/bignum.c
index af9a399b6f..4413752633 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1676,6 +1676,14 @@ int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint
 
     /* Calculate X*b as A + A*(b-1) to take advantage of mpi_mul_hlp */
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
+    /* In general, A * b requires 1 limb more than b. If
+     * A->p[n - 1] * b / b == A->p[n - 1], then A * b fits in the same
+     * number of limbs as A and the call to grow() is not required since
+     * copy() will take care of the growth. However, experimentally,
+     * making the call to grow() conditional causes slightly fewer
+     * calls to calloc() in ECP code, presumably because it reuses the
+     * same mpi for a while and this way the mpi is more likely to directly
+     * grow to its final size. */
     MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, n + 1 ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
     mpi_mul_hlp( n, A->p, X->p, b - 1 );

From 2536aa709bc26b6cb8dd840cd2f7368767eee7e6 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Fri, 24 Jul 2020 00:12:59 +0200
Subject: [PATCH 08/10] mbedtls_mpi_div_mpi: directly grow T1 to its useful
 size

T1 is set to a 2-limb value. The first operation that takes it as
input is mbedtls_mpi_mul_int, which makes it grow to 3 limbs. Later it
is shifted left, which causes it to grow again. Set its size to the
final size from the start. This saves two calls to calloc(), at the
expense of a slowdown in some operations involving T1 as input since
it now has more leading zeros.

Setting T1 to 3 limbs initially instead of 2 saves about 6% of the
calloc() calls in test_suite_ecp and does not incur a performance
penalty. Setting T1 to A->n + 2 limbs instead of 2 saves about 20% of
the calloc calls and does not cause a measurable performance
difference on my Linux PC.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/bignum.c b/library/bignum.c
index 4413752633..f1e544370f 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1830,7 +1830,7 @@ int mbedtls_mpi_div_mpi( mbedtls_mpi *Q, mbedtls_mpi *R, const mbedtls_mpi *A,
 
     MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &Z, A->n + 2 ) );
     MBEDTLS_MPI_CHK( mbedtls_mpi_lset( &Z,  0 ) );
-    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &T1, 2 ) );
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( &T1, A->n + 2 ) );
 
     k = mbedtls_mpi_bitlen( &Y ) % biL;
     if( k < biL - 1 )

From e1bba7ce481bfb06e059d1aad5efee50945fb451 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Wed, 10 Mar 2021 23:44:10 +0100
Subject: [PATCH 09/10] Fix semantically meaningful typos in comments

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/bignum.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/library/bignum.c b/library/bignum.c
index f1e544370f..7981175135 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1674,13 +1674,13 @@ int mbedtls_mpi_mul_int( mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint
         return( 0 );
     }
 
-    /* Calculate X*b as A + A*(b-1) to take advantage of mpi_mul_hlp */
+    /* Calculate A*b as A + A*(b-1) to take advantage of mpi_mul_hlp */
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     /* In general, A * b requires 1 limb more than b. If
      * A->p[n - 1] * b / b == A->p[n - 1], then A * b fits in the same
      * number of limbs as A and the call to grow() is not required since
-     * copy() will take care of the growth. However, experimentally,
-     * making the call to grow() conditional causes slightly fewer
+     * copy() will take care of the growth if needed. However, experimentally,
+     * making the call to grow() unconditional causes slightly fewer
      * calls to calloc() in ECP code, presumably because it reuses the
      * same mpi for a while and this way the mpi is more likely to directly
      * grow to its final size. */

From b76517b764182c18ad8519300426e99e07ecee21 Mon Sep 17 00:00:00 2001
From: Gilles Peskine <Gilles.Peskine@arm.com>
Date: Wed, 10 Mar 2021 23:44:28 +0100
Subject: [PATCH 10/10] Cosmetic improvement

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
---
 library/ecp_curves.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index a1aab5deb6..962d5af9bc 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -1001,7 +1001,7 @@ static inline void sub32( uint32_t *dst, uint32_t src, signed char *carry )
 #define SUB( j )    sub32( &cur, A( j ), &c );
 
 #define ciL    (sizeof(mbedtls_mpi_uint))         /* chars in limb  */
-#define biL    (ciL << 3)               /* bits  in limb  */
+#define biL    (ciL << 3)                         /* bits  in limb  */
 
 /*
  * Helpers for the main 'loop'