diff --git a/include/mbedtls/ecp.h b/include/mbedtls/ecp.h
index 04a557bdb4..07ed110cdc 100644
--- a/include/mbedtls/ecp.h
+++ b/include/mbedtls/ecp.h
@@ -310,9 +310,15 @@ typedef void mbedtls_ecp_restart_ctx;
  *                  MBEDTLS_ERR_ECP_IN_PROGRESS will be returned by the
  *                  function performing the computation. It is then the
  *                  caller's responsibility to either call again with the same
- *                  arguments until it returns 0 or an error code; or to free
+ *                  parameters until it returns 0 or an error code; or to free
  *                  the restart context if the operation is to be aborted.
  *
+ *                  It is strictly required that all input parameters and the
+ *                  restart context be the same on successive calls for the
+ *                  same operation, but output parameters need not be the
+ *                  same; they must not be used until the function finally
+ *                  returns 0.
+ *
  *                  This only affects functions that accept a pointer to a
  *                  \c mbedtls_ecp_restart_ctx as an argument, and only works
  *                  if that pointer valid (in particular, not NULL).
@@ -334,10 +340,13 @@ typedef void mbedtls_ecp_restart_ctx;
  *                  operations, and will do so even if max_ops is set to a
  *                  lower value.  That minimum depends on the curve size, and
  *                  can be made lower by decreasing the value of
- *                  \c MBEDTLS_ECP_WINDOW_SIZE.  As an indication, with that
- *                  parameter set to 4, the minimum amount of blocking is:
- *                  - around 165 basic operations for P-256
- *                  - around 330 basic operations for P-384
+ *                  \c MBEDTLS_ECP_WINDOW_SIZE.  As an indication, here is the
+ *                  lowest effective value for various curves and values of
+ *                  that parameter (w for short):
+ *                          w=6     w=5     w=4     w=3     w=2
+ *                  P-256   208     208     160     136     124
+ *                  P-384   682     416     320     272     248
+ *                  P-521  1364     832     640     544     496
  *
  * \note            This setting is currently ignored by Curve25519
  */
diff --git a/library/ecp.c b/library/ecp.c
index 74a19eecb3..9a8f552b74 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -89,6 +89,13 @@ static unsigned long add_count, dbl_count, mul_count;
 #if defined(MBEDTLS_ECP_RESTARTABLE)
 /*
  * Maximum number of "basic operations" to be done in a row.
+ *
+ * Default value 0 means that ECC operations will not yield.
+ * Note that regardless of the value of ecp_max_ops, at least
+ * one step is always performed before yielding.
+ *
+ * Setting ecp_max_ops=1 can be suitable for testing purposes
+ * as it will interrupt computation at all possible points.
  */
 static unsigned ecp_max_ops = 0;
 
@@ -1341,11 +1348,38 @@ cleanup:
  * modified version that provides resistance to SPA by avoiding zero
  * digits in the representation as in [3]. We modify the method further by
  * requiring that all K_i be odd, which has the small cost that our
- * representation uses one more K_i, due to carries.
+ * representation uses one more K_i, due to carries, but saves on the size of
+ * the precomputed table.
  *
- * Also, for the sake of compactness, only the seven low-order bits of x[i]
- * are used to represent K_i, and the msb of x[i] encodes the the sign (s_i in
- * the paper): it is set if and only if if s_i == -1;
+ * Summary of the comb method and its modifications:
+ *
+ * - The goal is to compute m*P for some w*d-bit integer m.
+ *
+ * - The basic comb method splits m into the w-bit integers
+ *   x[0] .. x[d-1] where x[i] consists of the bits in m whose
+ *   index has residue i modulo d, and computes m * P as
+ *   S[x[0]] + 2 * S[x[1]] + .. + 2^(d-1) S[x[d-1]], where
+ *   S[i_{w-1} .. i_0] := i_{w-1} 2^{(w-1)d} P + ... + i_1 2^d P + i_0 P.
+ *
+ * - If it happens that, say, x[i+1]=0 (=> S[x[i+1]]=0), one can replace the sum by
+ *    .. + 2^{i-1} S[x[i-1]] - 2^i S[x[i]] + 2^{i+1} S[x[i]] + 2^{i+2} S[x[i+2]] ..,
+ *   thereby successively converting it into a form where all summands
+ *   are nonzero, at the cost of negative summands. This is the basic idea of [3].
+ *
+ * - More generally, even if x[i+1] != 0, we can first transform the sum as
+ *   .. - 2^i S[x[i]] + 2^{i+1} ( S[x[i]] + S[x[i+1]] ) + 2^{i+2} S[x[i+2]] ..,
+ *   and then replace S[x[i]] + S[x[i+1]] = S[x[i] ^ x[i+1]] + 2 S[x[i] & x[i+1]].
+ *   Performing and iterating this procedure for those x[i] that are even
+ *   (keeping track of carry), we can transform the original sum into one of the form
+ *   S[x'[0]] +- 2 S[x'[1]] +- .. +- 2^{d-1} S[x'[d-1]] + 2^d S[x'[d]]
+ *   with all x'[i] odd. It is therefore only necessary to know S at odd indices,
+ *   which is why we are only computing half of it in the first place in
+ *   ecp_precompute_comb and accessing it with index abs(i) / 2 in ecp_select_comb.
+ *
+ * - For the sake of compactness, only the seven low-order bits of x[i]
+ *   are used to represent its absolute value (K_i in the paper), and the msb
+ *   of x[i] encodes the sign (s_i in the paper): it is set if and only if
+ *   s_i == -1.
  *
  * Calling conventions:
  * - x is an array of size d + 1
@@ -1385,14 +1419,41 @@ static void ecp_comb_recode_core( unsigned char x[], size_t d,
 }
 
 /*
- * Precompute points for the comb method
+ * Precompute points for the adapted comb method
  *
- * If i = i_{w-1} ... i_1 is the binary representation of i, then
- * T[i] = i_{w-1} 2^{(w-1)d} P + ... + i_1 2^d P + P
+ * Assumption: T must be able to hold 2^{w - 1} elements.
  *
- * T must be able to hold 2^{w - 1} elements
+ * Operation: If i = i_{w-1} ... i_1 is the binary representation of i,
+ *            sets T[i] = i_{w-1} 2^{(w-1)d} P + ... + i_1 2^d P + P.
  *
  * Cost: d(w-1) D + (2^{w-1} - 1) A + 1 N(w-1) + 1 N(2^{w-1} - 1)
+ *
+ * Note: Even comb values (those where P would be omitted from the
+ *       sum defining T[i] above) are not needed in our adaptation
+ *       of the comb method. See ecp_comb_recode_core().
+ *
+ * This function currently works in four steps:
+ * (1) Computation of intermediate T[i] for power-of-2 values of i
+ *     (restart state is ecp_rsm_init).
+ * (2) Normalization of coordinates of these T[i]
+ *     (restart state is ecp_rsm_pre_norm_dbl).
+ * (3) Computation of all T[i] (restart state is ecp_rsm_pre_add).
+ * (4) Normalization of all T[i] (restart state is ecp_rsm_pre_norm_add).
+ * The final restart state is ecp_rsm_T_done.
+ *
+ * Step 1 can be interrupted but not the others; together with the final
+ * coordinate normalization they are the largest steps done at once, depending
+ * on the window size. Here are operation counts for P-256:
+ *
+ * step     (2)     (3)     (4)
+ * w = 5    142     165     208
+ * w = 4    136      77     160
+ * w = 3    130      33     136
+ * w = 2    124      11     124
+ *
+ * So if ECC operations are blocking for too long even with a low max_ops
+ * value, it's useful to set MBEDTLS_ECP_WINDOW_SIZE to a lower value in order
+ * to minimize maximum blocking time.
  */
 static int ecp_precompute_comb( const mbedtls_ecp_group *grp,
                                 mbedtls_ecp_point T[], const mbedtls_ecp_point *P,
@@ -1534,6 +1595,8 @@ cleanup:
 
 /*
  * Select precomputed point: R = sign(i) * T[ abs(i) / 2 ]
+ *
+ * See ecp_comb_recode_core() for background.
  */
 static int ecp_select_comb( const mbedtls_ecp_group *grp, mbedtls_ecp_point *R,
                             const mbedtls_ecp_point T[], unsigned char t_len,
@@ -1637,6 +1700,8 @@ cleanup:
  * As the actual scalar recoding needs an odd scalar as a starting point,
  * this wrapper ensures that by replacing m by N - m if necessary, and
  * informs the caller that the result of multiplication will be negated.
+ *
+ * See ecp_comb_recode_core() for background.
  */
 static int ecp_comb_recode_scalar( const mbedtls_ecp_group *grp,
                                    const mbedtls_mpi *m,
@@ -1824,8 +1889,7 @@ static int ecp_mul_comb( mbedtls_ecp_group *grp, mbedtls_ecp_point *R,
     /* Pre-computed table: do we have it already for the base point? */
     if( p_eq_g && grp->T != NULL )
     {
-        /* second pointer to the same table
-         * no ownership transfer as other threads might be using T too */
+        /* second pointer to the same table, will be deleted on exit */
         T = grp->T;
         T_ok = 1;
     }
@@ -1862,9 +1926,10 @@ static int ecp_mul_comb( mbedtls_ecp_group *grp, mbedtls_ecp_point *R,
 
         if( p_eq_g )
         {
+            /* almost transfer ownership of T to the group, but keep a copy of
+             * the pointer to use for calling the next function more easily */
             grp->T = T;
             grp->T_size = pre_len;
-            /* now have two pointers to the same table */
         }
     }