From 30477c2518464a5f4f17450409d90c674e168c61 Mon Sep 17 00:00:00 2001
From: Alcaro
Date: Sat, 22 Feb 2014 18:33:47 +0100
Subject: [PATCH] Even if SSE doesn't gain us anything, going 32bit is a clear win.

---
 rewind.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/rewind.c b/rewind.c
index 636598011a..9aeea3902b 100644
--- a/rewind.c
+++ b/rewind.c
@@ -340,23 +340,35 @@ static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 static inline size_t find_same(const uint16_t * a, const uint16_t * b)
 {
 	const uint16_t * a_org=a;
-
-	//Comparing two or three words makes no real difference.
-	//With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB;
-	// with three, we get larger blocks which should be a minuscle bit faster to decompress,
-	// but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that.
-	while (a[0]!=b[0] || a[1]!=b[1])
+#ifdef NO_UNALIGNED_MEM
+	if ((uintptr_t)a & (sizeof(uint32_t)-1) && *a!=*b)
 	{
 		a++;
 		b++;
-		//Optimize this by only checking one at the time for as long as possible.
-		while (*a!=*b)
+	}
+	if (*a!=*b)
+#endif
+	{
+		//With this, it's random whether two consecutive identical words are caught.
+		//Luckily, compression rate is the same for both cases, and three is always caught.
+		//(We prefer to miss two-word blocks, anyways; fewer iterations of the outer loop, as well as in the decompressor.)
+		const uint32_t* a_big=(const uint32_t*)a;
+		const uint32_t* b_big=(const uint32_t*)b;
+
+		while (*a_big!=*b_big)
 		{
-			a++;
-			b++;
+			a_big++;
+			b_big++;
+		}
+		a=(const uint16_t*)a_big;
+		b=(const uint16_t*)b_big;
+
+		if (a!=a_org && a[-1]==b[-1])
+		{
+			a--;
+			b--;
 		}
 	}
-
 	return a-a_org;
 }
 #endif