diff --git a/rewind.c b/rewind.c
index 0e992c8254..756348f4d2 100644
--- a/rewind.c
+++ b/rewind.c
@@ -19,7 +19,6 @@
 #include "rewind.h"
 #include "performance.h"
 #include <stdlib.h>
-#include <stdint.h>
 #include <string.h>
 #include <retro_inline.h>
 #include "intl/intl.h"
@@ -63,6 +62,260 @@ repeat {
 size thisstart;
 #endif
 
+size_t state_manager_raw_maxsize(size_t uncomp) /* worst-case patch size, in bytes, for an 'uncomp'-byte savestate */
+{
+   const size_t maxcblkcover = UINT16_MAX * sizeof(uint16_t); /* bytes covered by a compressed block */
+   size_t uncomp16 = (uncomp + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1); /* uncompressed size, rounded up to whole uint16_t words */
+   size_t maxcblks = (uncomp + maxcblkcover - 1) / maxcblkcover; /* number of blocks */
+   return uncomp16 + maxcblks * sizeof(uint16_t)*2 /* two u16 overhead per block */ + sizeof(uint16_t)*3; /* three u16 to end it */
+}
+
+/* Allocates a zero-filled buffer for a 'len'-byte savestate, followed by
+ * guard words and padding. Returns NULL if the allocation fails; otherwise
+ * the caller releases the buffer with free(). */
+void *state_manager_raw_alloc(size_t len, uint16_t uniq)
+{
+   size_t len16  = (len + sizeof(uint16_t) - 1) & ~(sizeof(uint16_t) - 1);
+   uint16_t *ret = (uint16_t*)calloc(len16 + sizeof(uint16_t) * 4 + 16, 1);
+
+   if (!ret)
+      return NULL;
+
+   /* Force in a word after the data that differs between buffers allocated
+    * with different 'uniq', so we don't need to check bounds in the
+    * innermost loop (it's expensive). The zeroed words in front of it are
+    * the same in every buffer, to stop the other scan.
+    * The 16 bytes of padding at the end keep large block reads inside the
+    * allocation; sacrificing them to get Valgrind happy is worth it. */
+   ret[len16/sizeof(uint16_t) + 3] = uniq;
+
+   return ret;
+}
+
+#if __SSE2__
+#if defined(__GNUC__) /* use the compiler's count-trailing-zeros builtin */
+static INLINE int compat_ctz(unsigned x)
+{
+   return __builtin_ctz(x);
+}
+#else
+/* Stand-in for __builtin_ctz() when __GNUC__ is not defined.
+ * Only checks at nibble granularity, because that's what we need:
+ * returns 0, 4, 8 or 12 for the lowest non-zero nibble among the
+ * low 16 bits of x, or 16 if all of those bits are clear. */
+static INLINE int compat_ctz(unsigned x)
+{
+   if (x & 0x000f)
+      return 0;
+   if (x & 0x00f0)
+      return 4;
+   if (x & 0x0f00)
+      return 8;
+   if (x & 0xf000)
+      return 12;
+   return 16;
+}
+#endif
+
+#include <emmintrin.h>
+/* Returns the index, in uint16_t words, of the first word at which 'a' and
+ * 'b' differ. There's no equivalent in libc; std::mismatch exists, but it's
+ * not optimized at all. The loop has no bounds check and ends only on a mismatch. */
+static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
+{
+   const __m128i *a128 = (const __m128i*)a;
+   const __m128i *b128 = (const __m128i*)b;
+   /* Compare 16 bytes per iteration, as four 32-bit lanes (unaligned loads). */
+   for (;;)
+   {
+      __m128i v0    = _mm_loadu_si128(a128);
+      __m128i v1    = _mm_loadu_si128(b128);
+      __m128i c     = _mm_cmpeq_epi32(v0, v1);
+      uint32_t mask = _mm_movemask_epi8(c);
+      /* One mask bit per byte; all 16 set means the whole chunk matched. */
+      if (mask != 0xffff) /* Something has changed, figure out where. */
+      {
+         size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
+               (compat_ctz(~mask))) >> 1; /* byte offset of the first differing lane -> word index */
+         return ret | (a[ret] == b[ret]); /* the lane's second word if its first one matches */
+      }
+      /* Nothing differs in this chunk; move on to the next 16 bytes. */
+      a128++;
+      b128++;
+   }
+}
+#else
+static INLINE size_t find_change(const uint16_t *a, const uint16_t *b) /* index of the first differing word */
+{
+   const uint16_t *a_org = a; /* the result is counted in uint16_t words from here */
+#ifdef NO_UNALIGNED_MEM /* step word by word until 'a' is size_t-aligned or a difference shows up */
+   while (((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b)
+   {
+      a++;
+      b++;
+   }
+   if (*a == *b) /* only scan on if the loop above did not stop at a difference */
+#endif
+   {
+      const size_t *a_big = (const size_t*)a;
+      const size_t *b_big = (const size_t*)b;
+      /* Skip matching data one size_t at a time; no bounds check, so the inputs must differ somewhere. */
+      while (*a_big == *b_big)
+      {
+         a_big++;
+         b_big++;
+      }
+      a = (const uint16_t*)a_big;
+      b = (const uint16_t*)b_big;
+      /* The mismatch is inside this size_t; narrow it down to the exact word. */
+      while (*a == *b)
+      {
+         a++;
+         b++;
+      }
+   }
+   return a - a_org;
+}
+#endif
+
+static INLINE size_t find_same(const uint16_t *a, const uint16_t *b) /* words until 'a' and 'b' match again */
+{
+   const uint16_t *a_org = a; /* the result is counted in uint16_t words from here */
+#ifdef NO_UNALIGNED_MEM /* consume one differing word when 'a' is not 32-bit aligned */
+   if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
+   {
+      a++;
+      b++;
+   }
+   if (*a != *b) /* only scan on if the current words still differ */
+#endif
+   {
+      /* Scan 32 bits at a time for the first chunk that is equal in both
+       * buffers. With this, it's random whether two consecutive identical
+       * words are caught.
+       *
+       * Luckily, compression rate is the same for both cases, and
+       * three is always caught.
+       * (We prefer to miss two-word blocks, anyways; fewer iterations
+       * of the outer loop, as well as in the decompressor.) */
+      const uint32_t *a_big = (const uint32_t*)a;
+      const uint32_t *b_big = (const uint32_t*)b;
+      /* No bounds check: the scan relies on equal data existing further on. */
+      while (*a_big != *b_big)
+      {
+         a_big++;
+         b_big++;
+      }
+      a = (const uint16_t*)a_big;
+      b = (const uint16_t*)b_big;
+      /* The word just before the equal chunk may match too; if so, end the run there. */
+      if (a != a_org && a[-1] == b[-1])
+      {
+         a--;
+         b--;
+      }
+   }
+   return a - a_org;
+}
+
+/* Diffs 'src' against 'dst' (both from state_manager_raw_alloc(), with
+ * different 'uniq') into 'patch', storing the 'src' words of every run that
+ * differs. Returns the number of bytes written, end marker included. */
+size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch)
+{
+   const uint16_t *old16 = (const uint16_t*)src;
+   const uint16_t *new16 = (const uint16_t*)dst;
+   uint16_t *compressed16 = (uint16_t*)patch;
+   size_t num16s = (len + sizeof(uint16_t) - 1) / sizeof(uint16_t);
+
+   while (num16s)
+   {
+      size_t i;
+      size_t changed;
+      size_t skip = find_change(old16, new16);
+
+      if (skip >= num16s)
+         break;
+
+      /* A long skip is stored as { 0, low 16 bits, high 16 bits }, so
+       * clamp it to 32 bits before advancing; whatever is left of an
+       * oversized run (8GB of unchanged data) goes to the next pass. */
+      if (skip > UINT32_MAX)
+         skip = UINT32_MAX;
+
+      old16 += skip;
+      new16 += skip;
+      num16s -= skip;
+
+      if (skip > UINT16_MAX)
+      {
+         *compressed16++ = 0;
+         *compressed16++ = skip;
+         *compressed16++ = skip >> 16;
+         continue;
+      }
+
+      changed = find_same(old16, new16);
+      if (changed > UINT16_MAX)
+         changed = UINT16_MAX;
+
+      /* Record: word count, words to skip first, then the 'src' words. */
+      *compressed16++ = changed;
+      *compressed16++ = skip;
+      for (i = 0; i < changed; i++)
+         compressed16[i] = old16[i];
+      old16 += changed;
+      new16 += changed;
+      num16s -= changed;
+      compressed16 += changed;
+   }
+
+   compressed16[0] = 0;
+   compressed16[1] = 0;
+   compressed16[2] = 0;
+   return (uint8_t*)(compressed16+3) - (uint8_t*)patch;
+}
+
+/* Applies 'patch' to 'data' in place. A record is { count, skip, 'count' words }
+ * or, when count is 0, { 0, low 16 bits, high 16 bits } of a pure skip; a zero
+ * skip ends the patch. 'patchlen' and 'datalen' are unused: no bounds checks. */
+void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen)
+{
+   uint16_t *out16 = (uint16_t*)data;
+   const uint16_t *patch16 = (const uint16_t*)patch;
+
+   (void)patchlen;
+   (void)datalen;
+
+   for (;;)
+   {
+      uint16_t i;
+      uint16_t numchanged = *(patch16++);
+
+      if (numchanged)
+      {
+         out16 += *patch16++;
+         /* We could do memcpy, but its constant-per-call overhead actually
+          * shows up; our average size in here seems to be 8 or something. */
+         for (i = 0; i < numchanged; i++)
+            out16[i] = patch16[i];
+
+         patch16 += numchanged;
+         out16 += numchanged;
+      }
+      else
+      {
+         /* Widen before shifting: the promoted (signed) int could overflow. */
+         uint32_t numunchanged = patch16[0] | ((uint32_t)patch16[1] << 16);
+
+         if (!numunchanged)
+            break;
+         patch16 += 2;
+         out16 += numunchanged;
+      }
+   }
+}
+
 /* The start offsets point to 'nextstart' of any given compressed frame.
  * Each uint16 is stored native endian; anything that claims any other 
  * endianness refers to the endianness of this specific item.
@@ -125,46 +378,21 @@ struct state_manager
 
 state_manager_t *state_manager_new(size_t state_size, size_t buffer_size)
 {
-   size_t newblocksize;
-   int maxcblks;
-   const int maxcblkcover = UINT16_MAX * sizeof(uint16_t);
    state_manager_t *state = (state_manager_t*)calloc(1, sizeof(*state));
 
    if (!state)
       return NULL;
 
-   newblocksize = ((state_size - 1) | (sizeof(uint16_t) - 1)) + 1;
-   state->blocksize = newblocksize;
-
-   maxcblks = (state->blocksize + maxcblkcover - 1) / maxcblkcover;
-   state->maxcompsize = state->blocksize + maxcblks * sizeof(uint16_t) * 2 +
-      sizeof(uint16_t) + sizeof(uint32_t) + sizeof(size_t) * 2;
-
+   state->blocksize = (state_size + sizeof(uint16_t) - 1) & ~sizeof(uint16_t);
+   /* the compressed data is surrounded by pointers to the other side */
+   state->maxcompsize = state_manager_raw_maxsize(state_size) + sizeof(size_t) * 2;
    state->data = (uint8_t*)malloc(buffer_size);
 
-   state->thisblock = (uint8_t*)
-      calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
-   state->nextblock = (uint8_t*)
-      calloc(state->blocksize + sizeof(uint16_t) * 4 + 16, 1);
+   state->thisblock = (uint8_t*)state_manager_raw_alloc(state_size, 0);
+   state->nextblock = (uint8_t*)state_manager_raw_alloc(state_size, 1);
    if (!state->data || !state->thisblock || !state->nextblock)
       goto error;
 
-   /* Force in a different byte at the end, so we don't need to check 
-    * bounds in the innermost loop (it's expensive).
-    *
-    * There is also a large amount of data that's the same, to stop 
-    * the other scan.
-    *
-    * There is also some padding at the end. This is so we don't 
-    * read outside the buffer end if we're reading in large blocks;
-    *
-    * It doesn't make any difference to us, but sacrificing 16 bytes to get 
-    * Valgrind happy is worth it. */
-   *(uint16_t*)(state->thisblock + state->blocksize + sizeof(uint16_t) * 3) =
-      0xFFFF;
-   *(uint16_t*)(state->nextblock + state->blocksize + sizeof(uint16_t) * 3) =
-      0x0000;
-
    state->capacity = buffer_size;
 
    state->head = state->data + sizeof(size_t);
@@ -215,42 +443,7 @@ bool state_manager_pop(state_manager_t *state, const void **data)
    compressed = state->data + start + sizeof(size_t);
    out = state->thisblock;
 
-   /* Begin decompression code
-    * out is the last pushed (or returned) state */
-   compressed16 = (const uint16_t*)compressed;
-   out16 = (uint16_t*)out;
-
-   for (;;)
-   {
-      uint16_t i;
-      uint16_t numchanged = *(compressed16++);
-
-      if (numchanged)
-      {
-         out16 += *compressed16++;
-
-         /* We could do memcpy, but it seems that memcpy has a 
-          * constant-per-call overhead that actually shows up.
-          *
-          * Our average size in here seems to be 8 or something.
-          * Therefore, we do something with lower overhead. */
-         for (i = 0; i < numchanged; i++)
-            out16[i] = compressed16[i];
-
-         compressed16 += numchanged;
-         out16 += numchanged;
-      }
-      else
-      {
-         uint32_t numunchanged = compressed16[0] | (compressed16[1] << 16);
-
-         if (!numunchanged)
-            break;
-         compressed16 += 2;
-         out16 += numunchanged;
-      }
-   }
-   /* End decompression code */
+   state_manager_raw_decompress(compressed, state->maxcompsize, out, state->blocksize);
 
    state->entries--;
    *data = state->thisblock;
@@ -276,132 +469,6 @@ void state_manager_push_where(state_manager_t *state, void **data)
    *data = state->nextblock;
 }
 
-#if __SSE2__
-#if defined(__GNUC__)
-static INLINE int compat_ctz(unsigned x)
-{
-   return __builtin_ctz(x);
-}
-#else
-
-/* Only checks at nibble granularity, 
- * because that's what we need. */
-
-static INLINE int compat_ctz(unsigned x)
-{
-   if (x & 0x000f)
-      return 0;
-   if (x & 0x00f0)
-      return 4;
-   if (x & 0x0f00)
-      return 8;
-   if (x & 0xf000)
-      return 12;
-   return 16;
-}
-#endif
-
-#include <emmintrin.h>
-/* There's no equivalent in libc, you'd think so ...
- * std::mismatch exists, but it's not optimized at all. */
-
-static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
-{
-   const __m128i *a128 = (const __m128i*)a;
-   const __m128i *b128 = (const __m128i*)b;
-	
-   for (;;)
-   {
-      __m128i v0    = _mm_loadu_si128(a128);
-      __m128i v1    = _mm_loadu_si128(b128);
-      __m128i c     = _mm_cmpeq_epi32(v0, v1);
-      uint32_t mask = _mm_movemask_epi8(c);
-
-      if (mask != 0xffff) /* Something has changed, figure out where. */
-      {
-         size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
-               (compat_ctz(~mask))) >> 1;
-			return ret | (a[ret] == b[ret]);
-      }
-
-      a128++;
-      b128++;
-   }
-}
-#else
-static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
-{
-   const uint16_t *a_org = a;
-#ifdef NO_UNALIGNED_MEM
-   while (((uintptr_t)a & (sizeof(size_t) - 1)) && *a == *b)
-   {
-      a++;
-      b++;
-   }
-   if (*a == *b)
-#endif
-   {
-      const size_t *a_big = (const size_t*)a;
-      const size_t *b_big = (const size_t*)b;
-		
-      while (*a_big == *b_big)
-      {
-         a_big++;
-         b_big++;
-      }
-      a = (const uint16_t*)a_big;
-      b = (const uint16_t*)b_big;
-		
-      while (*a == *b)
-      {
-         a++;
-         b++;
-      }
-   }
-   return a - a_org;
-}
-#endif
-
-static INLINE size_t find_same(const uint16_t *a, const uint16_t *b)
-{
-   const uint16_t *a_org = a;
-#ifdef NO_UNALIGNED_MEM
-   if (((uintptr_t)a & (sizeof(uint32_t) - 1)) && *a != *b)
-   {
-      a++;
-      b++;
-   }
-   if (*a != *b)
-#endif
-   {
-      /* With this, it's random whether two consecutive identical
-       * words are caught.
-       *
-       * Luckily, compression rate is the same for both cases, and 
-       * three is always caught.
-       *
-       * (We prefer to miss two-word blocks, anyways; fewer iterations 
-       * of the outer loop, as well as in the decompressor.) */
-      const uint32_t *a_big = (const uint32_t*)a;
-      const uint32_t *b_big = (const uint32_t*)b;
-		
-      while (*a_big != *b_big)
-      {
-         a_big++;
-         b_big++;
-      }
-      a = (const uint16_t*)a_big;
-      b = (const uint16_t*)b_big;
-		
-      if (a != a_org && a[-1] == b[-1])
-      {
-         a--;
-         b--;
-      }
-   }
-   return a - a_org;
-}
-
 void state_manager_push_do(state_manager_t *state)
 {
    uint8_t *swap = NULL;
@@ -438,62 +505,7 @@ recheckcapacity:;
       newb = state->nextblock;
       compressed = state->head + sizeof(size_t);
 
-      /* Begin compression code; 'compressed' will point to 
-       * the end of the compressed data (excluding the prev pointer). */
-      old16 = (const uint16_t*)oldb;
-      new16 = (const uint16_t*)newb;
-      compressed16 = (uint16_t*)compressed;
-      num16s = state->blocksize / sizeof(uint16_t);
-
-      while (num16s)
-      {
-         size_t i;
-         size_t skip = find_change(old16, new16);
-
-         if (skip >= num16s)
-            break;
-
-         old16 += skip;
-         new16 += skip;
-         num16s -= skip;
-
-         if (skip > UINT16_MAX)
-         {
-            if (skip > UINT32_MAX)
-            {
-               /* This will make it scan the entire thing again, 
-                * but it only hits on 8GB unchanged data anyways,
-                * and if you're doing that, you've got bigger problems. */
-               skip = UINT32_MAX;
-            }
-            *compressed16++ = 0;
-            *compressed16++ = skip;
-            *compressed16++ = skip >> 16;
-            skip = 0;
-            continue;
-         }
-
-         size_t changed = find_same(old16, new16);
-         if (changed > UINT16_MAX)
-            changed = UINT16_MAX;
-
-         *compressed16++ = changed;
-         *compressed16++ = skip;
-
-         for (i = 0; i < changed; i++)
-            compressed16[i] = old16[i];
-
-         old16 += changed;
-         new16 += changed;
-         num16s -= changed;
-         compressed16 += changed;
-      }
-
-      compressed16[0] = 0;
-      compressed16[1] = 0;
-      compressed16[2] = 0;
-      compressed = (uint8_t*)(compressed16 + 3);
-      /* End compression code. */
+      compressed += state_manager_raw_compress(oldb, newb, state->blocksize, compressed);
 
       if (compressed - state->data + state->maxcompsize > state->capacity)
       {
diff --git a/rewind.h b/rewind.h
index 08052ee6f2..62aa660f56 100644
--- a/rewind.h
+++ b/rewind.h
@@ -24,6 +24,7 @@ extern "C" {
 
 #include <stddef.h>
 #include <boolean.h>
+#include <stdint.h>
 
 typedef struct state_manager state_manager_t;
 
@@ -42,6 +43,31 @@ void state_manager_capacity(state_manager_t *state,
 
 void init_rewind(void);
 
+
+/* Returns the maximum compressed size, in bytes, of a savestate of 'uncomp' bytes. It is very likely to compress to far less. */
+size_t state_manager_raw_maxsize(size_t uncomp);
+
+/*
+ * Allocates a zero-filled buffer for a 'len'-byte savestate; see state_manager_raw_compress for how it is used.
+ * When you're done with it, send it to free().
+ */
+void *state_manager_raw_alloc(size_t len, uint16_t uniq);
+
+/*
+ * Takes two savestates and creates a patch that turns 'dst' back into 'src' (the patch stores the words of 'src').
+ * Both 'src' and 'dst' must be returned from state_manager_raw_alloc(), with the same 'len', and different 'uniq'.
+ * 'patch' must be size 'state_manager_raw_maxsize(len)' or more.
+ * Returns the number of bytes actually written to 'patch'.
+ */
+size_t state_manager_raw_compress(const void *src, const void *dst, size_t len, void *patch);
+
+/*
+ * Takes 'patch' from a previous call to 'state_manager_raw_compress' and applies it to 'data' ('dst' from that call),
+ * yielding 'src' in that call. 'patchlen' and 'datalen' are currently ignored.
+ * If the given arguments do not match a previous call to state_manager_raw_compress(), anything at all can happen.
+ */
+void state_manager_raw_decompress(const void *patch, size_t patchlen, void *data, size_t datalen);
+
 #ifdef __cplusplus
 }
 #endif