aria: define SLA() as SL() followed by A()

This decreases the size with -Os by nearly 1k while not hurting performance
too much with -O2 and -O3.

Before:

    O    aria.o    ins
    s      8784    41,408
    2     11112    37,001
    3     13096    27,438

After:

    O    aria.o    ins
    s      7976    43,865
    2     10520    37,631
    3     13040    28,146

(See previous commit for measurement details.)
2025-04-01 04:20:45 +00:00 · 2018-02-21 12:35:19 +01:00 · 2018-02-21 12:35:19 +01:00 · 64744f88b6
commit 64744f88b6
parent 8c76a9489e
1 changed files with 63 additions and 102 deletions
--- a/library/aria.c
+++ b/library/aria.c
@@ -84,87 +84,62 @@ static void mbedtls_zeroize( void *v, size_t n ) {
 #define ARIA_FLIP2(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8))
 /*
- * Affine Transform A
+ * ARIA Affine Transform
 * (ra, rb, rc, rd) = state in/out
 */
-#define ARIA_A( ra, rb, rc, rd ) {      \
+static inline void aria_a( uint32_t *a, uint32_t *b,
-    uint32_t ta, tb, tc;                \
+                           uint32_t *c, uint32_t *d )
-    ta  =   rb;                         \
+{
-    rb  =   ra;                         \
+    uint32_t ta, tb, tc;
-    ra  =   ARIA_FLIP1( ta );           \
+    ta  =   *b;
-    tb  =   ARIA_FLIP1( rd );           \
+    *b  =   *a;
-    rd  =   ARIA_FLIP2( rc );           \
+    *a  =   ARIA_FLIP1( ta );
-    rc  =   ARIA_FLIP2( tb );           \
+    tb  =   ARIA_FLIP1( *d );
-    ta  ^=  rd;                         \
+    *d  =   ARIA_FLIP2( *c );
-    tc  =   ARIA_FLIP1( rb );           \
+    *c  =   ARIA_FLIP2( tb );
-    ta  =   ARIA_FLIP2( ta ) ^ tc ^ rc; \
+    ta  ^=  *d;
-    tb  ^=  ARIA_FLIP1( rd );           \
+    tc  =   ARIA_FLIP1( *b );
-    tc  ^=  ARIA_FLIP2( ra );           \
+    ta  =   ARIA_FLIP2( ta ) ^ tc ^ *c;
-    rb  ^=  ta ^ tb;                    \
+    tb  ^=  ARIA_FLIP1( *d );
-    tb  =   ARIA_FLIP1( tb ) ^ ta;      \
+    tc  ^=  ARIA_FLIP2( *a );
-    ra  ^=  ARIA_FLIP2( tb );           \
+    *b  ^=  ta ^ tb;
-    ta  =   ARIA_FLIP1( ta );           \
+    tb  =   ARIA_FLIP1( tb ) ^ ta;
-    rd  ^=  ARIA_FLIP2( ta ) ^ tc;      \
+    *a  ^=  ARIA_FLIP2( tb );
-    tc  =   ARIA_FLIP1( tc );           \
+    ta  =   ARIA_FLIP1( ta );
-    rc  ^=  ARIA_FLIP2( tc ) ^ ta;      \
+    *d  ^=  ARIA_FLIP2( ta ) ^ tc;
    tc  =   ARIA_FLIP1( tc );
    *c  ^=  ARIA_FLIP2( tc ) ^ ta;
 }
 /*
- * ARIA Round function ( Substitution Layer SLx + Affine Transform A )
+ * ARIA Substitution Layer SL1 / SL2
- * (ra, rb, rc, rd) = state in/out
+ * (a, b, c, d) = state in/out
 * (sa, sb, sc, sd) = 256 8-bit S-Boxes (see below)
 *
- * By passing sb1, sb2, is1, is2 as S-Boxes you get SL1-then-A.
+ * By passing sb1, sb2, is1, is2 as S-Boxes you get SL1
- * By passing is1, is2, sb1, sb2 as S-Boxes you get SL2-then-A.
+ * By passing is1, is2, sb1, sb2 as S-Boxes you get SL2
 */
-static inline void aria_sla( uint32_t *a, uint32_t *b,
+static inline void aria_sl( uint32_t *a, uint32_t *b,
-                             uint32_t *c, uint32_t *d,
+                            uint32_t *c, uint32_t *d,
-                             const uint8_t sa[0x100], const uint8_t sb[0x100],
+                            const uint8_t sa[0x100], const uint8_t sb[0x100],
-                             const uint8_t sc[0x100], const uint8_t sd[0x100] )
+                            const uint8_t sc[0x100], const uint8_t sd[0x100] )
 {
-    uint32_t ra, rb, rc, rd, ta, tb, tc;
+    *a = ( (uint32_t) sa[ *a        & 0xFF])        ^
-
+         (((uint32_t) sb[(*a >>  8) & 0xFF]) <<  8) ^
-    ra = *a;
+         (((uint32_t) sc[(*a >> 16) & 0xFF]) << 16) ^
-    rb = *b;
+         (((uint32_t) sd[ *a >> 24        ]) << 24);
-    rc = *c;
+    *b = ( (uint32_t) sa[ *b        & 0xFF])        ^
-    rd = *d;
+         (((uint32_t) sb[(*b >>  8) & 0xFF]) <<  8) ^
-
+         (((uint32_t) sc[(*b >> 16) & 0xFF]) << 16) ^
-    ta  =   ( (uint32_t) sc[(rb >> 16) & 0xFF]) ^
+         (((uint32_t) sd[ *b >> 24        ]) << 24);
-            (((uint32_t) sd[ rb >> 24]) << 8)   ^
+    *c = ( (uint32_t) sa[ *c        & 0xFF])        ^
-            (((uint32_t) sa[ rb & 0xFF]) << 16) ^
+         (((uint32_t) sb[(*c >>  8) & 0xFF]) <<  8) ^
-            (((uint32_t) sb[(rb >> 8) & 0xFF]) << 24);
+         (((uint32_t) sc[(*c >> 16) & 0xFF]) << 16) ^
-    rb  =   ( (uint32_t) sa[ ra & 0xFF]) ^
+         (((uint32_t) sd[ *c >> 24        ]) << 24);
-            (((uint32_t) sb[(ra >> 8) & 0xFF]) << 8) ^
+    *d = ( (uint32_t) sa[ *d        & 0xFF])        ^
-            (((uint32_t) sc[(ra >> 16) & 0xFF]) << 16) ^
+         (((uint32_t) sb[(*d >>  8) & 0xFF]) <<  8) ^
-            (((uint32_t) sd[ ra >> 24]) << 24);
+         (((uint32_t) sc[(*d >> 16) & 0xFF]) << 16) ^
-    ra  =   ta;
+         (((uint32_t) sd[ *d >> 24        ]) << 24);
    ta  =   ( (uint32_t) sd[ rd >> 24]) ^
            (((uint32_t) sc[(rd >> 16) & 0xFF]) << 8) ^
            (((uint32_t) sb[(rd >> 8) & 0xFF]) << 16) ^
            (((uint32_t) sa[ rd & 0xFF]) << 24);
    rd  =   ( (uint32_t) sb[(rc >> 8) & 0xFF]) ^
            (((uint32_t) sa[ rc & 0xFF]) << 8) ^
            (((uint32_t) sd[ rc >> 24]) << 16) ^
            (((uint32_t) sc[(rc >> 16) & 0xFF]) << 24);
    rc  =   ta;
    ta  =   ARIA_FLIP1( ra ) ^ rd;
    tc  =   ARIA_FLIP1( rb );
    ta  =   ARIA_FLIP2( ta ) ^ tc ^ rc;
    tb  =   ARIA_FLIP2( rc ) ^ ARIA_FLIP1( rd );
    tc  ^=  ARIA_FLIP2( ra );
    rb  ^=  ta ^ tb;
    tb  =   ARIA_FLIP1( tb ) ^ ta;
    ra  ^=  ARIA_FLIP2( tb );
    ta  =   ARIA_FLIP1( ta );
    rd  ^=  ARIA_FLIP2( ta ) ^ tc;
    tc  =   ARIA_FLIP1( tc );
    rc  ^=  ARIA_FLIP2( tc ) ^ ta;
    *a = ra;
    *b = rb;
    *c = rc;
    *d = rd;
 }
 /*
@@ -287,7 +262,8 @@ static void aria_fo_xor( uint32_t r[4],
    c = p[2] ^ k[2];
    d = p[3] ^ k[3];
-    aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+    aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
    aria_a( &a, &b, &c, &d );
    r[0] = a ^ x[0];
    r[1] = b ^ x[1];
@@ -308,7 +284,8 @@ static void aria_fe_xor(uint32_t r[4],
    c = p[2] ^ k[2];
    d = p[3] ^ k[3];
-    aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
+    aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
    aria_a( &a, &b, &c, &d );
    r[0] = a ^ x[0];
    r[1] = b ^ x[1];
@@ -430,7 +407,7 @@ int mbedtls_aria_setkey_dec(mbedtls_aria_context *ctx,
    /* apply affine transform to middle keys */
    for (i = 1; i < ctx->nr; i++ )
-        ARIA_A( ctx->rk[i][0], ctx->rk[i][1], ctx->rk[i][2], ctx->rk[i][3] );
+        aria_a( &ctx->rk[i][0], &ctx->rk[i][1], &ctx->rk[i][2], &ctx->rk[i][3] );
    return 0;
 }
@@ -462,43 +439,27 @@ int mbedtls_aria_crypt_ecb( mbedtls_aria_context *ctx,
        c ^= ctx->rk[i][2];
        d ^= ctx->rk[i][3];
        i++;
-        aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
+
        aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
        aria_a( &a, &b, &c, &d );
        a ^= ctx->rk[i][0];
        b ^= ctx->rk[i][1];
        c ^= ctx->rk[i][2];
        d ^= ctx->rk[i][3];
        i++;
        aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
        if (i >= ctx->nr)
            break;
-
+        aria_a( &a, &b, &c, &d );
        aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
    }
-    /* final substitution */
+    /* final key mixing */
-    a = ctx->rk[i][0] ^
+    a ^= ctx->rk[i][0];
-        ( (uint32_t) aria_is1[ a        & 0xFF])        ^
+    b ^= ctx->rk[i][1];
-        (((uint32_t) aria_is2[(a >>  8) & 0xFF]) <<  8) ^
+    c ^= ctx->rk[i][2];
-        (((uint32_t) aria_sb1[(a >> 16) & 0xFF]) << 16) ^
+    d ^= ctx->rk[i][3];
        (((uint32_t) aria_sb2[ a >> 24        ]) << 24);
    b = ctx->rk[i][1] ^
        ( (uint32_t) aria_is1[ b        & 0xFF])        ^
        (((uint32_t) aria_is2[(b >>  8) & 0xFF]) <<  8) ^
        (((uint32_t) aria_sb1[(b >> 16) & 0xFF]) << 16) ^
        (((uint32_t) aria_sb2[ b >> 24        ]) << 24);
    c = ctx->rk[i][2] ^
        ( (uint32_t) aria_is1[ c        & 0xFF])        ^
        (((uint32_t) aria_is2[(c >>  8) & 0xFF]) <<  8) ^
        (((uint32_t) aria_sb1[(c >> 16) & 0xFF]) << 16) ^
        (((uint32_t) aria_sb2[ c >> 24        ]) << 24);
    d = ctx->rk[i][3] ^
        ( (uint32_t) aria_is1[ d        & 0xFF])        ^
        (((uint32_t) aria_is2[(d >>  8) & 0xFF]) <<  8) ^
        (((uint32_t) aria_sb1[(d >> 16) & 0xFF]) << 16) ^
        (((uint32_t) aria_sb2[ d >> 24        ]) << 24);
    PUT_UINT32_LE( a, output,  0 );
    PUT_UINT32_LE( b, output,  4 );