/*
 * Raw state-delta helpers factored out of the rewind state manager
 * (definitions live in rewind.c, declarations in rewind.h).
 *
 * In the change this code comes from:
 *  - state_manager_new() allocates 'thisblock' and 'nextblock' through
 *    state_manager_raw_alloc(state_size, 0) and (state_size, 1), and sizes
 *    'maxcompsize' as state_manager_raw_maxsize(state_size) plus two size_t
 *    (the compressed data is surrounded by pointers to the other side).
 *  - state_manager_pop() applies a stored patch to 'thisblock' through
 *    state_manager_raw_decompress().
 *  - state_manager_push_do() builds the patch through
 *    state_manager_raw_compress(oldb, newb, blocksize, compressed).
 *
 * NOTE(review): state_manager_new() computes 'blocksize' with the expression
 * '(state_size + sizeof(uint16_t) - 1) & ~sizeof(uint16_t)'. That mask clears
 * bit 1 instead of bit 0 and needs the same correction that is applied in
 * raw_round_up16() below.
 *
 * Each uint16 of a patch is stored native endian.
 */

/* Fallback for building this block in isolation; the project normally
 * provides INLINE itself. */
#ifndef INLINE
#define INLINE inline
#endif

/* Rounds a byte count up to a whole number of uint16_t words.
 *
 * The mask must be ~(sizeof(uint16_t) - 1). Using ~sizeof(uint16_t) clears
 * bit 1 rather than bit 0, which turns e.g. 2 into 1 and 5 into 4. */
static INLINE size_t raw_round_up16(size_t len)
{
   return (len + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1);
}

/*
 * Returns the maximum compressed size of a savestate of 'uncomp' bytes.
 * It is very likely to compress to far less.
 */
size_t state_manager_raw_maxsize(size_t uncomp)
{
   /* Bytes covered by a single compressed block. */
   const size_t maxcblkcover = UINT16_MAX * sizeof(uint16_t);
   /* Uncompressed size, rounded up to 16 bits. */
   size_t uncomp16           = raw_round_up16(uncomp);
   /* Number of blocks. */
   size_t maxcblks           = (uncomp + maxcblkcover - 1) / maxcblkcover;

   return uncomp16
      + maxcblks * sizeof(uint16_t) * 2 /* two u16 overhead per block */
      + sizeof(uint16_t) * 3;           /* three u16 to end it */
}

/*
 * Allocates a zeroed savestate buffer of 'len' bytes (rounded up to 16 bits)
 * suitable for state_manager_raw_compress().
 *
 * Returns NULL if the allocation fails. When you're done with the buffer,
 * send it to free().
 */
void *state_manager_raw_alloc(size_t len, uint16_t uniq)
{
   size_t len16  = raw_round_up16(len);
   uint16_t *ret = (uint16_t*)calloc(len16 + sizeof(uint16_t) * 4 + 16, 1);

   if (!ret)
      return NULL;

   /* Force in a different word at the end ('uniq' must differ between the
    * two buffers being compared), so we don't need to check bounds in the
    * innermost loop (it's expensive).
    *
    * The three zeroed words before it are the same in every buffer,
    * which stops the other scan (find_same).
    *
    * There is also some padding at the end. This is so we don't
    * read outside the buffer end if we're reading in large blocks.
    *
    * It doesn't make any difference to us, but sacrificing 16 bytes to get
    * Valgrind happy is worth it. */
   ret[len16 / sizeof(uint16_t) + 3] = uniq;

   return ret;
}

#if __SSE2__
#if defined(__GNUC__)
/* Index of the lowest set bit of 'x'. */
static INLINE int compat_ctz(unsigned x)
{
   return __builtin_ctz(x);
}
#else

/* Only checks at nibble granularity,
 * because that's what we need. */
static INLINE int compat_ctz(unsigned x)
{
   if (x & 0x000f)
      return 0;
   if (x & 0x00f0)
      return 4;
   if (x & 0x0f00)
      return 8;
   if (x & 0xf000)
      return 12;
   return 16;
}
#endif

#include <emmintrin.h>
/* There's no equivalent in libc, though you'd think there would be.
 * std::mismatch exists, but it's not optimized at all. */

/* Returns the index (in uint16_t units) of the first word where 'a' and 'b'
 * differ. Performs no bounds checking: the caller guarantees that the
 * buffers differ somewhere (see state_manager_raw_alloc). */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const __m128i *a128 = (const __m128i*)a;
   const __m128i *b128 = (const __m128i*)b;

   for (;;)
   {
      __m128i v0    = _mm_loadu_si128(a128);
      __m128i v1    = _mm_loadu_si128(b128);
      __m128i c     = _mm_cmpeq_epi32(v0, v1);
      uint32_t mask = _mm_movemask_epi8(c);

      if (mask != 0xffff) /* Something has changed, figure out where. */
      {
         /* The compare is 32 bits wide, so this lands on the first word
          * of the differing pair... */
         size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
               (compat_ctz(~mask))) >> 1;
         /* ...and moves to the second word if the first one is equal. */
         return ret | (a[ret] == b[ret]);
      }

      a128++;
      b128++;
   }
}
#else
/* Returns the index (in uint16_t units) of the first word where 'a' and 'b'
 * differ. Performs no bounds checking: the caller guarantees that the
 * buffers differ somewhere (see state_manager_raw_alloc). */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
   while (((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b)
   {
      a++;
      b++;
   }
   if (*a == *b)
#endif
   {
      const size_t *a_big = (const size_t*)a;
      const size_t *b_big = (const size_t*)b;

      /* Skip equal data one size_t at a time... */
      while (*a_big == *b_big)
      {
         a_big++;
         b_big++;
      }
      a = (const uint16_t*)a_big;
      b = (const uint16_t*)b_big;

      /* ...then locate the exact word. */
      while (*a == *b)
      {
         a++;
         b++;
      }
   }
   return a - a_org;
}
#endif

/* Returns the number of uint16_t words, starting at 'a'/'b', that belong to
 * the current run of changed data. Performs no bounds checking: the scan is
 * stopped by the identical zeroed words past the end of the data (see
 * state_manager_raw_alloc). */
static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
{
   const uint16_t *a_org = a;
#ifdef NO_UNALIGNED_MEM
   if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
   {
      a++;
      b++;
   }
   if (*a != *b)
#endif
   {
      /* With this, it's random whether two consecutive identical
       * words are caught.
       *
       * Luckily, compression rate is the same for both cases, and
       * three is always caught.
       *
       * (We prefer to miss two-word blocks, anyways; fewer iterations
       * of the outer loop, as well as in the decompressor.) */
      const uint32_t *a_big = (const uint32_t*)a;
      const uint32_t *b_big = (const uint32_t*)b;

      while (*a_big != *b_big)
      {
         a_big++;
         b_big++;
      }
      a = (const uint16_t*)a_big;
      b = (const uint16_t*)b_big;

      /* The word just before the equal pair may be equal as well. */
      if (a != a_org && a[-1] == b[-1])
      {
         a--;
         b--;
      }
   }
   return a - a_org;
}

/*
 * Compares two savestates and writes a patch to 'patch'.
 *
 * The patch records the words of 'src' at every position where 'src' and
 * 'dst' differ, so applying it (state_manager_raw_decompress) to a buffer
 * holding the contents of 'dst' recreates 'src'.
 *
 * Both 'src' and 'dst' must be returned from state_manager_raw_alloc(),
 * with the same 'len', and different 'uniq'.
 * 'patch' must be size 'state_manager_raw_maxsize(len)' or more.
 *
 * Patch layout, as a sequence of uint16_t:
 *   numchanged != 0: { numchanged, numunchanged, numchanged words of data }
 *   numchanged == 0: { 0, numunchanged low 16 bits, numunchanged high 16 bits }
 *   end of patch   : { 0, 0, 0 }
 *
 * Returns the number of bytes actually written to 'patch'.
 */
size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch)
{
   const uint16_t *old16  = (const uint16_t*)src;
   const uint16_t *new16  = (const uint16_t*)dst;
   uint16_t *compressed16 = (uint16_t*)patch;
   size_t num16s          = (len + sizeof(uint16_t) - 1) / sizeof(uint16_t);

   while (num16s)
   {
      size_t i;
      size_t changed;
      size_t skip = find_change(old16, new16);

      /* The next difference is the end-of-buffer sentinel: we're done. */
      if (skip >= num16s)
         break;

      if (skip > UINT16_MAX)
      {
         /* Too long for a 16-bit skip count; emit a 32-bit skip record. */
         if (skip > UINT32_MAX)
         {
            /* Clamp BEFORE advancing, so the cursors stay in sync with
             * what the patch says.
             *
             * This will make it scan the rest of the run again,
             * but it only hits on 8GB unchanged data anyways,
             * and if you're doing that, you've got bigger problems. */
            skip = UINT32_MAX;
         }

         old16  += skip;
         new16  += skip;
         num16s -= skip;

         *compressed16++ = 0;
         *compressed16++ = (uint16_t)skip;
         *compressed16++ = (uint16_t)(skip >> 16);
         continue;
      }

      old16  += skip;
      new16  += skip;
      num16s -= skip;

      changed = find_same(old16, new16);
      if (changed > UINT16_MAX)
         changed = UINT16_MAX;

      *compressed16++ = (uint16_t)changed;
      *compressed16++ = (uint16_t)skip;

      for (i = 0; i < changed; i++)
         compressed16[i] = old16[i];

      old16        += changed;
      new16        += changed;
      num16s       -= changed;
      compressed16 += changed;
   }

   /* Terminator: a skip record with a count of zero. */
   compressed16[0] = 0;
   compressed16[1] = 0;
   compressed16[2] = 0;

   return (uint8_t*)(compressed16 + 3) - (uint8_t*)patch;
}

/*
 * Applies 'patch', as written by state_manager_raw_compress(), to 'data'.
 * 'data' must hold the contents of the 'dst' buffer of that call; afterwards
 * it holds the contents of 'src'.
 *
 * 'patchlen' and 'datalen' are currently unused; no bounds are checked.
 * If the given arguments do not match a previous call to
 * state_manager_raw_compress(), anything at all can happen.
 */
void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen)
{
   uint16_t *out16         = (uint16_t*)data;
   const uint16_t *patch16 = (const uint16_t*)patch;

   (void)patchlen;
   (void)datalen;

   for (;;)
   {
      uint16_t i;
      uint16_t numchanged = *(patch16++);

      if (numchanged)
      {
         /* Short skip, followed by 'numchanged' words of data. */
         out16 += *patch16++;

         /* We could do memcpy, but it seems that memcpy has a
          * constant-per-call overhead that actually shows up.
          *
          * Our average size in here seems to be 8 or something.
          * Therefore, we do something with lower overhead. */
         for (i = 0; i < numchanged; i++)
            out16[i] = patch16[i];

         patch16 += numchanged;
         out16   += numchanged;
      }
      else
      {
         /* Widen before shifting: a uint16_t promotes to (signed) int, and
          * shifting a value >= 0x8000 left by 16 would overflow it. */
         uint32_t numunchanged = (uint32_t)patch16[0] |
            ((uint32_t)patch16[1] << 16);

         if (!numunchanged)
            break;
         patch16 += 2;
         out16   += numunchanged;
      }
   }
}