aria: define SLA() as sl() followed by a()

This decreases the size of aria.o with -Os by about 800 bytes (8784 -> 7976),
while not hurting performance too much with -O2 and -O3.

Before:
O	aria.o	ins
s	8784	41,408
2	11112	37,001
3	13096	27,438

After:
O	aria.o	ins
s	7976	43,865
2	10520	37,631
3	13040	28,146

(See previous commit for measurement details.)
This commit is contained in:
Manuel Pégourié-Gonnard 2018-02-21 12:35:19 +01:00
parent 8c76a9489e
commit 64744f88b6

View File

@ -84,87 +84,62 @@ static void mbedtls_zeroize( void *v, size_t n ) {
#define ARIA_FLIP2(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8)) #define ARIA_FLIP2(x) ((((x) >> 8) & 0x00FF00FF) ^ (((x) & 0x00FF00FF) << 8))
/* /*
* Affine Transform A * ARIA Affine Transform
* (ra, rb, rc, rd) = state in/out * (ra, rb, rc, rd) = state in/out
*/ */
#define ARIA_A( ra, rb, rc, rd ) { \ static inline void aria_a( uint32_t *a, uint32_t *b,
uint32_t ta, tb, tc; \ uint32_t *c, uint32_t *d )
ta = rb; \ {
rb = ra; \ uint32_t ta, tb, tc;
ra = ARIA_FLIP1( ta ); \ ta = *b;
tb = ARIA_FLIP1( rd ); \ *b = *a;
rd = ARIA_FLIP2( rc ); \ *a = ARIA_FLIP1( ta );
rc = ARIA_FLIP2( tb ); \ tb = ARIA_FLIP1( *d );
ta ^= rd; \ *d = ARIA_FLIP2( *c );
tc = ARIA_FLIP1( rb ); \ *c = ARIA_FLIP2( tb );
ta = ARIA_FLIP2( ta ) ^ tc ^ rc; \ ta ^= *d;
tb ^= ARIA_FLIP1( rd ); \ tc = ARIA_FLIP1( *b );
tc ^= ARIA_FLIP2( ra ); \ ta = ARIA_FLIP2( ta ) ^ tc ^ *c;
rb ^= ta ^ tb; \ tb ^= ARIA_FLIP1( *d );
tb = ARIA_FLIP1( tb ) ^ ta; \ tc ^= ARIA_FLIP2( *a );
ra ^= ARIA_FLIP2( tb ); \ *b ^= ta ^ tb;
ta = ARIA_FLIP1( ta ); \ tb = ARIA_FLIP1( tb ) ^ ta;
rd ^= ARIA_FLIP2( ta ) ^ tc; \ *a ^= ARIA_FLIP2( tb );
tc = ARIA_FLIP1( tc ); \ ta = ARIA_FLIP1( ta );
rc ^= ARIA_FLIP2( tc ) ^ ta; \ *d ^= ARIA_FLIP2( ta ) ^ tc;
tc = ARIA_FLIP1( tc );
*c ^= ARIA_FLIP2( tc ) ^ ta;
} }
/* /*
* ARIA Round function ( Substitution Layer SLx + Affine Transform A ) * ARIA Substitution Layer SL1 / SL2
* (ra, rb, rc, rd) = state in/out * (a, b, c, d) = state in/out
* (sa, sb, sc, sd) = 256 8-bit S-Boxes (see below) * (sa, sb, sc, sd) = 256 8-bit S-Boxes (see below)
* *
* By passing sb1, sb2, is1, is2 as S-Boxes you get SL1-then-A. * By passing sb1, sb2, is1, is2 as S-Boxes you get SL1
* By passing is1, is2, sb1, sb2 as S-Boxes you get SL2-then-A. * By passing is1, is2, sb1, sb2 as S-Boxes you get SL2
*/ */
static inline void aria_sla( uint32_t *a, uint32_t *b, static inline void aria_sl( uint32_t *a, uint32_t *b,
uint32_t *c, uint32_t *d, uint32_t *c, uint32_t *d,
const uint8_t sa[0x100], const uint8_t sb[0x100], const uint8_t sa[0x100], const uint8_t sb[0x100],
const uint8_t sc[0x100], const uint8_t sd[0x100] ) const uint8_t sc[0x100], const uint8_t sd[0x100] )
{ {
uint32_t ra, rb, rc, rd, ta, tb, tc; *a = ( (uint32_t) sa[ *a & 0xFF]) ^
(((uint32_t) sb[(*a >> 8) & 0xFF]) << 8) ^
ra = *a; (((uint32_t) sc[(*a >> 16) & 0xFF]) << 16) ^
rb = *b; (((uint32_t) sd[ *a >> 24 ]) << 24);
rc = *c; *b = ( (uint32_t) sa[ *b & 0xFF]) ^
rd = *d; (((uint32_t) sb[(*b >> 8) & 0xFF]) << 8) ^
(((uint32_t) sc[(*b >> 16) & 0xFF]) << 16) ^
ta = ( (uint32_t) sc[(rb >> 16) & 0xFF]) ^ (((uint32_t) sd[ *b >> 24 ]) << 24);
(((uint32_t) sd[ rb >> 24]) << 8) ^ *c = ( (uint32_t) sa[ *c & 0xFF]) ^
(((uint32_t) sa[ rb & 0xFF]) << 16) ^ (((uint32_t) sb[(*c >> 8) & 0xFF]) << 8) ^
(((uint32_t) sb[(rb >> 8) & 0xFF]) << 24); (((uint32_t) sc[(*c >> 16) & 0xFF]) << 16) ^
rb = ( (uint32_t) sa[ ra & 0xFF]) ^ (((uint32_t) sd[ *c >> 24 ]) << 24);
(((uint32_t) sb[(ra >> 8) & 0xFF]) << 8) ^ *d = ( (uint32_t) sa[ *d & 0xFF]) ^
(((uint32_t) sc[(ra >> 16) & 0xFF]) << 16) ^ (((uint32_t) sb[(*d >> 8) & 0xFF]) << 8) ^
(((uint32_t) sd[ ra >> 24]) << 24); (((uint32_t) sc[(*d >> 16) & 0xFF]) << 16) ^
ra = ta; (((uint32_t) sd[ *d >> 24 ]) << 24);
ta = ( (uint32_t) sd[ rd >> 24]) ^
(((uint32_t) sc[(rd >> 16) & 0xFF]) << 8) ^
(((uint32_t) sb[(rd >> 8) & 0xFF]) << 16) ^
(((uint32_t) sa[ rd & 0xFF]) << 24);
rd = ( (uint32_t) sb[(rc >> 8) & 0xFF]) ^
(((uint32_t) sa[ rc & 0xFF]) << 8) ^
(((uint32_t) sd[ rc >> 24]) << 16) ^
(((uint32_t) sc[(rc >> 16) & 0xFF]) << 24);
rc = ta;
ta = ARIA_FLIP1( ra ) ^ rd;
tc = ARIA_FLIP1( rb );
ta = ARIA_FLIP2( ta ) ^ tc ^ rc;
tb = ARIA_FLIP2( rc ) ^ ARIA_FLIP1( rd );
tc ^= ARIA_FLIP2( ra );
rb ^= ta ^ tb;
tb = ARIA_FLIP1( tb ) ^ ta;
ra ^= ARIA_FLIP2( tb );
ta = ARIA_FLIP1( ta );
rd ^= ARIA_FLIP2( ta ) ^ tc;
tc = ARIA_FLIP1( tc );
rc ^= ARIA_FLIP2( tc ) ^ ta;
*a = ra;
*b = rb;
*c = rc;
*d = rd;
} }
/* /*
@ -287,7 +262,8 @@ static void aria_fo_xor( uint32_t r[4],
c = p[2] ^ k[2]; c = p[2] ^ k[2];
d = p[3] ^ k[3]; d = p[3] ^ k[3];
aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 ); aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
aria_a( &a, &b, &c, &d );
r[0] = a ^ x[0]; r[0] = a ^ x[0];
r[1] = b ^ x[1]; r[1] = b ^ x[1];
@ -308,7 +284,8 @@ static void aria_fe_xor(uint32_t r[4],
c = p[2] ^ k[2]; c = p[2] ^ k[2];
d = p[3] ^ k[3]; d = p[3] ^ k[3];
aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 ); aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
aria_a( &a, &b, &c, &d );
r[0] = a ^ x[0]; r[0] = a ^ x[0];
r[1] = b ^ x[1]; r[1] = b ^ x[1];
@ -430,7 +407,7 @@ int mbedtls_aria_setkey_dec(mbedtls_aria_context *ctx,
/* apply affine transform to middle keys */ /* apply affine transform to middle keys */
for (i = 1; i < ctx->nr; i++ ) for (i = 1; i < ctx->nr; i++ )
ARIA_A( ctx->rk[i][0], ctx->rk[i][1], ctx->rk[i][2], ctx->rk[i][3] ); aria_a( &ctx->rk[i][0], &ctx->rk[i][1], &ctx->rk[i][2], &ctx->rk[i][3] );
return 0; return 0;
} }
@ -462,43 +439,27 @@ int mbedtls_aria_crypt_ecb( mbedtls_aria_context *ctx,
c ^= ctx->rk[i][2]; c ^= ctx->rk[i][2];
d ^= ctx->rk[i][3]; d ^= ctx->rk[i][3];
i++; i++;
aria_sla( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
aria_sl( &a, &b, &c, &d, aria_sb1, aria_sb2, aria_is1, aria_is2 );
aria_a( &a, &b, &c, &d );
a ^= ctx->rk[i][0]; a ^= ctx->rk[i][0];
b ^= ctx->rk[i][1]; b ^= ctx->rk[i][1];
c ^= ctx->rk[i][2]; c ^= ctx->rk[i][2];
d ^= ctx->rk[i][3]; d ^= ctx->rk[i][3];
i++; i++;
aria_sl( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
if (i >= ctx->nr) if (i >= ctx->nr)
break; break;
aria_a( &a, &b, &c, &d );
aria_sla( &a, &b, &c, &d, aria_is1, aria_is2, aria_sb1, aria_sb2 );
} }
/* final substitution */ /* final key mixing */
a = ctx->rk[i][0] ^ a ^= ctx->rk[i][0];
( (uint32_t) aria_is1[ a & 0xFF]) ^ b ^= ctx->rk[i][1];
(((uint32_t) aria_is2[(a >> 8) & 0xFF]) << 8) ^ c ^= ctx->rk[i][2];
(((uint32_t) aria_sb1[(a >> 16) & 0xFF]) << 16) ^ d ^= ctx->rk[i][3];
(((uint32_t) aria_sb2[ a >> 24 ]) << 24);
b = ctx->rk[i][1] ^
( (uint32_t) aria_is1[ b & 0xFF]) ^
(((uint32_t) aria_is2[(b >> 8) & 0xFF]) << 8) ^
(((uint32_t) aria_sb1[(b >> 16) & 0xFF]) << 16) ^
(((uint32_t) aria_sb2[ b >> 24 ]) << 24);
c = ctx->rk[i][2] ^
( (uint32_t) aria_is1[ c & 0xFF]) ^
(((uint32_t) aria_is2[(c >> 8) & 0xFF]) << 8) ^
(((uint32_t) aria_sb1[(c >> 16) & 0xFF]) << 16) ^
(((uint32_t) aria_sb2[ c >> 24 ]) << 24);
d = ctx->rk[i][3] ^
( (uint32_t) aria_is1[ d & 0xFF]) ^
(((uint32_t) aria_is2[(d >> 8) & 0xFF]) << 8) ^
(((uint32_t) aria_sb1[(d >> 16) & 0xFF]) << 16) ^
(((uint32_t) aria_sb2[ d >> 24 ]) << 24);
PUT_UINT32_LE( a, output, 0 ); PUT_UINT32_LE( a, output, 0 );
PUT_UINT32_LE( b, output, 4 ); PUT_UINT32_LE( b, output, 4 );