/*  RetroArch - A frontend for libretro.
 *  Copyright (C) 2010-2014 - Hans-Kristian Arntzen
 *
 *  RetroArch is free software: you can redistribute it and/or modify it under the terms
 *  of the GNU General Public License as published by the Free Software Found-
 *  ation, either version 3 of the License, or (at your option) any later version.
 *
 *  RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 *  PURPOSE.  See the GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along with RetroArch.
 *  If not, see <http://www.gnu.org/licenses/>.
 */

#define __STDC_LIMIT_MACROS
#include "rewind.h"
#include <stdlib.h>
#include "msvc/msvc-stdint/stdint.h"

//#define NO_UNALIGNED_MEM
//Uncomment the above if alignment is enforced.

//Format per frame:
//size nextstart;
//repeat {
//  uint16 numchanged; // everything is counted in units of uint16
//  if (numchanged) {
//    uint16 numunchanged; // skip these before handling numchanged
//    uint16[numchanged] changeddata;
//  }
//  else
//  {
//    uint32 numunchanged;
//    if (!numunchanged) break;
//  }
//}
//size thisstart;
//
//The start offsets point to 'nextstart' of any given compressed frame.
//Each uint16 is stored native endian; anything that claims any other endianness refers to the endianness of this specific item.
//The uint32 is stored little endian.
//Each size value is stored native endian if alignment is not enforced; if it is, they're little endian.
//The start of the buffer contains a size pointing to the end of the buffer; the end points to its start.
//Wrapping is handled by returning to the start of the buffer if the compressed data could potentially hit the edge;
// if the compressed data could potentially overwrite the tail pointer, the tail retreats until it can no longer collide.
//This means that on average, ~2*maxcompsize is unused at any given moment.
//Compile-time sanity check: SIZE_MAX must agree with sizeof(size_t), and only 32/64-bit size_t is supported.
#if SIZE_MAX == 0xFFFFFFFF
extern char double_check_sizeof_size_t[(sizeof(size_t)==4)?1:-1];
#elif SIZE_MAX == 0xFFFFFFFFFFFFFFFF
extern char double_check_sizeof_size_t[(sizeof(size_t)==8)?1:-1];
#define USE_64BIT
#else
#error This item is only tested on 32bit and 64bit.
#endif

#ifdef NO_UNALIGNED_MEM
//These functions assume 16bit alignment.
//They do not make any attempt at matching system native endian; values written by these can only be read by the matching partner.
#ifdef USE_64BIT
static inline void write_size_t(uint16_t* ptr, size_t val)
{
   ptr[0] = (uint16_t)(val>>0);
   ptr[1] = (uint16_t)(val>>16);
   ptr[2] = (uint16_t)(val>>32);
   ptr[3] = (uint16_t)(val>>48);
}

static inline size_t read_size_t(uint16_t* ptr)
{
   return ((size_t)ptr[0]<<0  |
           (size_t)ptr[1]<<16 |
           (size_t)ptr[2]<<32 |
           (size_t)ptr[3]<<48);
}
#else
static inline void write_size_t(uint16_t* ptr, size_t val)
{
   ptr[0] = (uint16_t)val;
   ptr[1] = (uint16_t)(val>>16);
}

static inline size_t read_size_t(uint16_t* ptr)
{
   return (ptr[0] | (size_t)ptr[1]<<16);
}
#endif
#else
//Alignment is not enforced: store and load the size_t directly at the (possibly only 16bit aligned) address.
#define read_size_t(ptr) (*(size_t*)(ptr))
#define write_size_t(ptr, val) (*(size_t*)(ptr) = (val))
#endif

//NOTE(review): callers use the name state_manager_t, which rewind.h (not visible here) is assumed to
// declare as an alias of this struct. The definitions below spell out the struct tag, which is the
// same type, so that this file does not depend on where that typedef lives.
struct state_manager
{
   char *data;      //Ring buffer holding the compressed frames.
   size_t capacity; //Size of 'data' in bytes.
   char *head;      //Reading and writing is done here.
   char *tail;      //If head comes close to this, discard a frame.

   char *thisblock; //Uncompressed copy of the most recently pushed (or popped) state.
   char *nextblock; //Scratch block handed out by state_manager_push_where().
   bool thisblock_valid; //Set by state_manager_push_do(), cleared by state_manager_pop(): thisblock is a stored state.

   size_t blocksize;   //state_size rounded up to a multiple of sizeof(uint16_t).
   size_t maxcompsize; //Space reserved per frame: blocksize + 2*u16 for every chunk of up to UINT16_MAX words
                       // + u16+u32 (terminator) + 2*size_t (the nextstart/thisstart links).

   unsigned int entries; //Number of states that can currently be popped.
};

//Creates a rewind buffer for states of state_size bytes, backed by buffer_size bytes of compressed storage.
//Returns NULL if any allocation fails; release the result with state_manager_free().
struct state_manager *state_manager_new(size_t state_size, size_t buffer_size)
{
   struct state_manager *state = (struct state_manager*)malloc(sizeof(*state));
   if (!state)
      return NULL;

   //Round up to whole uint16s; everything below is counted in units of uint16. Sizes stay in size_t so large states are not truncated.
   state->blocksize = ((state_size-1)|(sizeof(uint16_t)-1))+1;

   const size_t maxcblkcover = UINT16_MAX*sizeof(uint16_t);
   const size_t maxcblks = (state->blocksize+maxcblkcover-1)/maxcblkcover;
   state->maxcompsize = state->blocksize + maxcblks*sizeof(uint16_t)*2 + sizeof(uint16_t)+sizeof(uint32_t) + sizeof(size_t)*2;

   state->data = (char*)malloc(buffer_size);
   state->thisblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*8, 1);
   state->nextblock = (char*)calloc(state->blocksize+sizeof(uint16_t)*8, 1);
   if (!state->data || !state->thisblock || !state->nextblock)
   {
      free(state->data);
      free(state->thisblock);
      free(state->nextblock);
      free(state);
      return NULL;
   }

   //Force in a different word past the end, so we don't need to check bounds in the innermost loop (it's expensive).
   //The three zeroed words in front of it are identical in both blocks; they are what stops the scan for changed data.
   //There is also some padding at the end. This is so we don't read outside the buffer end if we're reading in large blocks;
   // it doesn't make any difference to us, but sacrificing 16 bytes to get Valgrind happy is worth it.
   *(uint16_t*)(state->thisblock+state->blocksize+sizeof(uint16_t)*3) = 0xFFFF;
   *(uint16_t*)(state->nextblock+state->blocksize+sizeof(uint16_t)*3) = 0x0000;

   state->capacity = buffer_size;

   //The first size_t of the buffer is reserved; it receives the 'thisstart' link of a frame that wraps around.
   state->head = state->data+sizeof(size_t);
   state->tail = state->data+sizeof(size_t);

   state->thisblock_valid = false;
   state->entries = 0;

   return state;
}

//Releases everything owned by 'state'. Passing NULL is a no-op.
void state_manager_free(struct state_manager *state)
{
   if (!state)
      return;
   free(state->data);
   free(state->thisblock);
   free(state->nextblock);
   free(state);
}

//Steps one state back. On success *data points at the uncompressed state (owned by 'state') and true is returned;
// if nothing is stored, *data is NULL and false is returned.
bool state_manager_pop(struct state_manager *state, const void **data)
{
   *data = NULL;

   //The most recent push is still sitting uncompressed in thisblock.
   if (state->thisblock_valid)
   {
      state->thisblock_valid = false;
      state->entries--;
      *data = state->thisblock;
      return true;
   }

   if (state->head == state->tail)
      return false;

   //The 'thisstart' link right in front of head says where the newest frame begins.
   size_t start = read_size_t((uint16_t*)(state->head - sizeof(size_t)));
   state->head = state->data+start;

   //Begin decompression code
   //thisblock is the last pushed (or returned) state; the frame patches it back to the state before it.
   const uint16_t *compressed16 = (const uint16_t*)(state->data+start+sizeof(size_t));
   uint16_t *out16 = (uint16_t*)state->thisblock;

   for (;;)
   {
      uint16_t numchanged = *(compressed16++);
      if (numchanged)
      {
         out16 += *(compressed16++);
         //We could do memcpy, but it seems that memcpy has a constant-per-call overhead that actually shows up.
         //Our average size in here seems to be 8 or something.
         //Therefore, we do something with lower overhead.
         for (unsigned int i=0;i<numchanged;i++)
            out16[i] = compressed16[i];
         compressed16 += numchanged;
         out16 += numchanged;
      }
      else
      {
         //Little endian uint32 split over two uint16s; the cast keeps the shift out of (signed) int.
         uint32_t numunchanged = compressed16[0] | (uint32_t)compressed16[1]<<16;
         if (!numunchanged)
            break;
         compressed16 += 2;
         out16 += numunchanged;
      }
   }
   //End decompression code

   state->entries--;
   *data = state->thisblock;
   return true;
}

//Hands out the block the caller must fill with the new state before calling state_manager_push_do().
void state_manager_push_where(struct state_manager *state, void **data)
{
   //We need to ensure we have an uncompressed copy of the last pushed state, or we could
   // end up applying a 'patch' to wrong savestate, and that'd blow up rather quickly.
   if (!state->thisblock_valid)
   {
      const void *ignored;
      if (state_manager_pop(state, &ignored))
      {
         state->thisblock_valid = true;
         //pop() counted this state as removed, but it is still stored (in thisblock), so count it again.
         state->entries++;
      }
   }

   *data = state->nextblock;
}

//Stores the state previously written to the block returned by state_manager_push_where().
//If the buffer cannot hold even a single frame, the new state is silently dropped.
void state_manager_push_do(struct state_manager *state)
{
   if (state->thisblock_valid)
   {
      //'<=' rather than '<': with exactly sizeof(size_t)+maxcompsize bytes an empty ring never has more
      // than maxcompsize free, and the loop below would chase a link that was never written.
      if (state->capacity <= sizeof(size_t)+state->maxcompsize)
         return;

      //Discard the oldest frames until a worst-case frame is guaranteed to fit.
      for (;;)
      {
         size_t headpos = (state->head-state->data);
         size_t tailpos = (state->tail-state->data);
         size_t remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
         if (remaining > state->maxcompsize)
            break;
         state->tail = state->data + read_size_t((uint16_t*)state->tail);
         state->entries--;
      }

      //Begin compression code; 'compressed' will point to the end of the compressed data (excluding the prev pointer).
      //The frame stores the OLD words wherever the blocks differ, so applying it to the new state yields the old one.
      const uint16_t *old16 = (const uint16_t*)state->thisblock;
      const uint16_t *new16 = (const uint16_t*)state->nextblock;
      uint16_t *compressed16 = (uint16_t*)(state->head+sizeof(size_t));
      size_t num16s = state->blocksize/sizeof(uint16_t);

      while (num16s)
      {
         const uint16_t *oldprev = old16;
#ifdef NO_UNALIGNED_MEM
         while ((uintptr_t)old16 & (sizeof(size_t)-1) && *old16==*new16)
         {
            old16++;
            new16++;
         }
         if (*old16==*new16)
#endif
         {
            //Scan the unchanged run a size_t at a time, then finish it word by word.
            //No bounds check: the sentinel words planted by state_manager_new() always differ.
            const size_t *olds = (const size_t*)old16;
            const size_t *news = (const size_t*)new16;

            while (*olds == *news)
            {
               olds++;
               news++;
            }
            old16 = (const uint16_t*)olds;
            new16 = (const uint16_t*)news;

            while (*old16 == *new16)
            {
               old16++;
               new16++;
            }
         }
         size_t skip = (old16-oldprev);

         //The rest of the block is unchanged; nothing more to record.
         if (skip >= num16s)
            break;

         if (skip > UINT32_MAX)
         {
            //This will make it scan the entire thing again, but it only hits on 8GB unchanged
            // data anyways, and if you're doing that, you've got bigger problems.
            old16 -= skip;
            new16 -= skip;
            skip = UINT32_MAX;
            old16 += skip;
            new16 += skip;
         }
         //Subtract only after clamping, so num16s stays in step with how far the pointers really moved.
         num16s -= skip;

         if (skip > UINT16_MAX)
         {
            //Too long for the uint16 field: emit numchanged=0 followed by a little endian uint32 skip.
            *(compressed16++) = 0;
            *(compressed16++) = (uint16_t)skip;
            *(compressed16++) = (uint16_t)(skip>>16);
            continue;
         }

         const uint16_t *old16prev = old16;
         //Comparing two or three words makes no real difference.
         //With two, the smaller blocks are less likely to be chopped up elsewhere due to 64KB;
         // with three, we get larger blocks which should be a minuscule bit faster to decompress,
         // but probably a little slower to compress. Since compression is more bottleneck than decompression is, we favor that.
         while (old16[0] != new16[0] || old16[1] != new16[1])
         {
            old16++;
            new16++;
            //Optimize this by only checking one at the time for as long as possible.
            while (*old16 != *new16)
            {
               old16++;
               new16++;
            }
         }
         size_t changed = (old16-old16prev);
         if (!changed)
            continue;
         if (changed > UINT16_MAX)
         {
            old16 -= changed;
            new16 -= changed;
            changed = UINT16_MAX;
            old16 += changed;
            new16 += changed;
         }
         num16s -= changed;
         *(compressed16++) = (uint16_t)changed;
         *(compressed16++) = (uint16_t)skip;
         for (size_t i=0;i<changed;i++)
            compressed16[i] = old16prev[i];
         compressed16 += changed;
      }
      //Terminator: numchanged=0 followed by a uint32 numunchanged=0.
      compressed16[0] = 0;
      compressed16[1] = 0;
      compressed16[2] = 0;
      char *compressed = (char*)(compressed16+3);
      //End compression code.

      //If the next frame could hit the edge of the buffer, continue from the start of the buffer instead.
      if (compressed - state->data + state->maxcompsize > state->capacity)
      {
         compressed = state->data;
         if (state->tail == state->data+sizeof(size_t))
         {
            //Head is about to land on tail, so whatever starts there is lost. Step tail past it, using
            // read_size_t() so the link is decoded the same way it was written. With an empty ring
            // (head == tail) the frame lost is the one just written, and its link does not exist yet.
            if (state->head != state->tail)
               state->tail = state->data + read_size_t((uint16_t*)state->tail);
            state->entries--;
         }
      }
      //'thisstart' goes right behind the frame (or at the buffer start after wrapping), 'nextstart' in front of it.
      write_size_t((uint16_t*)compressed, (size_t)(state->head-state->data));
      compressed += sizeof(size_t);
      write_size_t((uint16_t*)state->head, (size_t)(compressed-state->data));
      state->head = compressed;
   }
   else
   {
      //Nothing to diff against yet; the new state simply becomes thisblock.
      state->thisblock_valid = true;
   }

   char *swap = state->thisblock;
   state->thisblock = state->nextblock;
   state->nextblock = swap;

   state->entries++;
}

//Reports usage; each output pointer may be NULL.
// entries: number of stored states; bytes: buffer bytes in use;
// full: whether at most 2*maxcompsize bytes remain free.
void state_manager_capacity(struct state_manager *state, unsigned int * entries, size_t * bytes, bool * full)
{
   size_t remaining = 0;

   //A zero-sized buffer has nothing free; this also avoids taking a remainder modulo 0.
   if (state->capacity)
   {
      size_t headpos = (state->head-state->data);
      size_t tailpos = (state->tail-state->data);
      remaining = (tailpos+state->capacity-sizeof(size_t)-headpos-1)%state->capacity + 1;
   }

   if (entries)
      *entries = state->entries;
   if (bytes)
      *bytes = (state->capacity-remaining);
   if (full)
      *full = (remaining<=state->maxcompsize*2);
}