diff --git a/Data/Sys/GameSettings/NAL.ini b/Data/Sys/GameSettings/NAL.ini
index 459ee44e13..a03b2d8a22 100644
--- a/Data/Sys/GameSettings/NAL.ini
+++ b/Data/Sys/GameSettings/NAL.ini
@@ -14,3 +14,8 @@
 
 [Video_Stereoscopy]
 StereoConvergence = 5000
+
+[Video_Settings]
+# This game creates a large number of EFB copies at different addresses, resulting
+# in a large texture cache which takes considerable time to save.
+SaveTextureCacheToState = False
\ No newline at end of file
diff --git a/Data/Sys/GameSettings/NAT.ini b/Data/Sys/GameSettings/NAT.ini
new file mode 100644
index 0000000000..ede4c31766
--- /dev/null
+++ b/Data/Sys/GameSettings/NAT.ini
@@ -0,0 +1,18 @@
+# NATJ01, NATP01, NATE01 - Mario Tennis (Virtual Console)
+
+[Core]
+# Values set here will override the main Dolphin settings.
+
+[OnLoad]
+# Add memory patches to be loaded once on boot here.
+
+[OnFrame]
+# Add memory patches to be applied every frame here.
+
+[ActionReplay]
+# Add action replay cheats here.
+
+[Video_Settings]
+# This game creates a large number of EFB copies at different addresses, resulting
+# in a large texture cache which takes considerable time to save.
+SaveTextureCacheToState = False
\ No newline at end of file
diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp
index 30ed2d6959..83baf5ae05 100644
--- a/Source/Core/Core/Config/GraphicsSettings.cpp
+++ b/Source/Core/Core/Config/GraphicsSettings.cpp
@@ -91,6 +91,8 @@ const ConfigInfo<int> GFX_SHADER_COMPILER_THREADS{
     {System::GFX, "Settings", "ShaderCompilerThreads"}, 1};
 const ConfigInfo<int> GFX_SHADER_PRECOMPILER_THREADS{
     {System::GFX, "Settings", "ShaderPrecompilerThreads"}, 1};
+const ConfigInfo<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE{
+    {System::GFX, "Settings", "SaveTextureCacheToState"}, true};
 
 const ConfigInfo<bool> GFX_SW_ZCOMPLOC{{System::GFX, "Settings", "SWZComploc"}, true};
 const ConfigInfo<bool> GFX_SW_ZFREEZE{{System::GFX, "Settings", "SWZFreeze"}, true};
diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h
index 1946e704fc..dc8d501fab 100644
--- a/Source/Core/Core/Config/GraphicsSettings.h
+++ b/Source/Core/Core/Config/GraphicsSettings.h
@@ -67,6 +67,7 @@ extern const ConfigInfo<bool> GFX_WAIT_FOR_SHADERS_BEFORE_STARTING;
 extern const ConfigInfo<ShaderCompilationMode> GFX_SHADER_COMPILATION_MODE;
 extern const ConfigInfo<int> GFX_SHADER_COMPILER_THREADS;
 extern const ConfigInfo<int> GFX_SHADER_PRECOMPILER_THREADS;
+extern const ConfigInfo<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE;
 
 extern const ConfigInfo<bool> GFX_SW_ZCOMPLOC;
 extern const ConfigInfo<bool> GFX_SW_ZFREEZE;
diff --git a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp
index b60693000b..15711a65b8 100644
--- a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp
+++ b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp
@@ -90,6 +90,7 @@ bool IsSettingSaveable(const Config::ConfigLocation& config_location)
       Config::GFX_SHADER_COMPILATION_MODE.location,
       Config::GFX_SHADER_COMPILER_THREADS.location,
       Config::GFX_SHADER_PRECOMPILER_THREADS.location,
+      Config::GFX_SAVE_TEXTURE_CACHE_TO_STATE.location,
 
       Config::GFX_SW_ZCOMPLOC.location,
       Config::GFX_SW_ZFREEZE.location,
diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp
index bdd3fcaf10..f1e3161220 100644
--- a/Source/Core/Core/Core.cpp
+++ b/Source/Core/Core/Core.cpp
@@ -21,6 +21,7 @@
 #include "Common/CPUDetect.h"
 #include "Common/CommonPaths.h"
 #include "Common/CommonTypes.h"
+#include "Common/Event.h"
 #include "Common/FileUtil.h"
 #include "Common/Flag.h"
 #include "Common/Logging/LogManager.h"
@@ -110,6 +111,7 @@ struct HostJob
 };
 static std::mutex s_host_jobs_lock;
 static std::queue<HostJob> s_host_jobs_queue;
+static Common::Event s_cpu_thread_job_finished;
 
 static thread_local bool tls_is_cpu_thread = false;
 
@@ -433,6 +435,7 @@ static void EmuThread(std::unique_ptr<BootParameters> boot, WindowSystemInfo wsi
   Common::ScopeGuard movie_guard{Movie::Shutdown};
 
   HW::Init();
+
   Common::ScopeGuard hw_guard{[] {
     // We must set up this flag before executing HW::Shutdown()
     s_hardware_initialized = false;
@@ -771,6 +774,45 @@ void RunAsCPUThread(std::function<void()> function)
     PauseAndLock(false, was_unpaused);
 }
 
+void RunOnCPUThread(std::function<void()> function, bool wait_for_completion)
+{
+  // Core not running (nothing to race against) or already on the CPU thread: run it inline.
+  if (!IsRunning() || IsCPUThread())
+  {
+    function();
+    return;
+  }
+
+  // Pause the CPU (set it to stepping mode).
+  const bool was_running = PauseAndLock(true, true);
+
+  // Queue the job function.
+  if (wait_for_completion)
+  {
+    // Signal the event once the function has run; by-ref capture is safe as we block below.
+    s_cpu_thread_job_finished.Reset();
+    CPU::AddCPUThreadJob([&function]() {
+      function();
+      s_cpu_thread_job_finished.Set();
+    });
+  }
+  else
+  {
+    CPU::AddCPUThreadJob(std::move(function));  // Not waited on; the queue owns the function.
+  }
+
+  // Release the CPU thread, and let it execute the callback.
+  PauseAndLock(false, was_running);
+
+  // If we're waiting for completion, block until the event fires.
+  if (wait_for_completion)
+  {
+    // Periodically yield to the UI thread, so we don't deadlock.
+    while (!s_cpu_thread_job_finished.WaitFor(std::chrono::milliseconds(10)))
+      Host_YieldToUI();
+  }
+}
+
 // Display FPS info
 // This should only be called from VI
 void VideoThrottle()
diff --git a/Source/Core/Core/Core.h b/Source/Core/Core/Core.h
index fdd30a539a..26e30a2d8a 100644
--- a/Source/Core/Core/Core.h
+++ b/Source/Core/Core/Core.h
@@ -82,6 +82,10 @@ void UpdateTitle();
 // This should only be called from the CPU thread or the host thread.
 void RunAsCPUThread(std::function<void()> function);
 
+// Run a function on the CPU thread, optionally blocking until it has finished executing.
+// Valid from the host thread (PauseAndLock() is used internally) or from the CPU thread itself.
+void RunOnCPUThread(std::function<void()> function, bool wait_for_completion);
+
 // for calling back into UI code without introducing a dependency on it in core
 using StateChangedCallbackFunc = std::function<void(Core::State)>;
 void SetOnStateChangedCallback(StateChangedCallbackFunc callback);
diff --git a/Source/Core/Core/HW/CPU.cpp b/Source/Core/Core/HW/CPU.cpp
index a1b21ea7a2..b8243a0452 100644
--- a/Source/Core/Core/HW/CPU.cpp
+++ b/Source/Core/Core/HW/CPU.cpp
@@ -6,6 +6,7 @@
 
 #include <condition_variable>
 #include <mutex>
+#include <queue>
 
 #include "AudioCommon/AudioCommon.h"
 #include "Common/CommonTypes.h"
@@ -44,6 +45,7 @@ static bool s_state_paused_and_locked = false;
 static bool s_state_system_request_stepping = false;
 static bool s_state_cpu_step_instruction = false;
 static Common::Event* s_state_cpu_step_instruction_sync = nullptr;
+static std::queue<std::function<void()>> s_pending_jobs;
 
 void Init(PowerPC::CPUCore cpu_core)
 {
@@ -60,6 +62,9 @@ void Shutdown()
 // Requires holding s_state_change_lock
 static void FlushStepSyncEventLocked()
 {
+  if (!s_state_cpu_step_instruction)
+    return;
+
   if (s_state_cpu_step_instruction_sync)
   {
     s_state_cpu_step_instruction_sync->Set();
@@ -68,12 +73,25 @@ static void FlushStepSyncEventLocked()
   s_state_cpu_step_instruction = false;
 }
 
+static void ExecutePendingJobs(std::unique_lock<std::mutex>& state_lock)
+{
+  while (!s_pending_jobs.empty())  // Drain every queued job; state_lock is held on entry.
+  {
+    auto callback = std::move(s_pending_jobs.front());  // Take ownership rather than copying.
+    s_pending_jobs.pop();
+    state_lock.unlock();  // Do not hold the state lock while the job itself runs.
+    callback();
+    state_lock.lock();  // Re-acquire before inspecting the queue again.
+  }
+}
+
 void Run()
 {
   std::unique_lock<std::mutex> state_lock(s_state_change_lock);
   while (s_state != State::PowerDown)
   {
     s_state_cpu_cvar.wait(state_lock, [] { return !s_state_paused_and_locked; });
+    ExecutePendingJobs(state_lock);
 
     switch (s_state)
     {
@@ -108,8 +126,10 @@ void Run()
 
     case State::Stepping:
       // Wait for step command.
-      s_state_cpu_cvar.wait(state_lock,
-                            [] { return s_state_cpu_step_instruction || !IsStepping(); });
+      s_state_cpu_cvar.wait(state_lock, [&state_lock] {
+        ExecutePendingJobs(state_lock);
+        return s_state_cpu_step_instruction || !IsStepping();
+      });
       if (!IsStepping())
       {
         // Signal event if the mode changes.
@@ -330,4 +350,11 @@ bool PauseAndLock(bool do_lock, bool unpause_on_unlock, bool control_adjacent)
   }
   return was_unpaused;
 }
+
+void AddCPUThreadJob(std::function<void()> function)
+{
+  std::lock_guard<std::mutex> guard(s_state_change_lock);  // Serialises access to the job queue.
+  s_pending_jobs.emplace(std::move(function));  // Picked up later by ExecutePendingJobs().
+}
+
 }  // namespace CPU
diff --git a/Source/Core/Core/HW/CPU.h b/Source/Core/Core/HW/CPU.h
index 408b82ace2..26ffa6783c 100644
--- a/Source/Core/Core/HW/CPU.h
+++ b/Source/Core/Core/HW/CPU.h
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #pragma once
+#include <functional>
 
 namespace Common
 {
@@ -74,4 +75,8 @@ const State* GetStatePtr();
 // "control_adjacent" causes PauseAndLock to behave like EnableStepping by modifying the
 //   state of the Audio and FIFO subsystems as well.
 bool PauseAndLock(bool do_lock, bool unpause_on_unlock = true, bool control_adjacent = false);
+
+// Adds a job to be executed on the CPU thread. This should be combined with PauseAndLock(),
+// as while the CPU is in the run loop, it won't execute the function.
+void AddCPUThreadJob(std::function<void()> function);
 }  // namespace CPU
diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp
index 42a2fa792b..cd4544b8f1 100644
--- a/Source/Core/Core/State.cpp
+++ b/Source/Core/Core/State.cpp
@@ -63,7 +63,7 @@ static AfterLoadCallbackFunc s_on_after_load_callback;
 // Temporary undo state buffer
 static std::vector<u8> g_undo_load_buffer;
 static std::vector<u8> g_current_buffer;
-static int g_loadDepth = 0;
+static bool s_load_or_save_in_progress;
 
 static std::mutex g_cs_undo_load_buffer;
 static std::mutex g_cs_current_buffer;
@@ -72,7 +72,7 @@ static Common::Event g_compressAndDumpStateSyncEvent;
 static std::thread g_save_thread;
 
 // Don't forget to increase this after doing changes on the savestate system
-static const u32 STATE_VERSION = 110;  // Last changed in PR 8036
+static const u32 STATE_VERSION = 111;  // Last changed in PR 6321
 
 // Maps savestate versions to Dolphin versions.
 // Versions after 42 don't need to be added to this list,
@@ -170,6 +170,11 @@ static void DoState(PointerWrap& p)
     return;
   }
 
+  // Movie must be done before the video backend, because the window is redrawn in the video backend
+  // state load, and the frame number must be up-to-date.
+  Movie::DoState(p);
+  p.DoMarker("Movie");
+
   // Begin with video backend, so that it gets a chance to clear its caches and writeback modified
   // things to RAM
   g_video_backend->DoState(p);
@@ -186,8 +191,6 @@ static void DoState(PointerWrap& p)
   if (SConfig::GetInstance().bWii)
     Wiimote::DoState(p);
   p.DoMarker("Wiimote");
-  Movie::DoState(p);
-  p.DoMarker("Movie");
   Gecko::DoState(p);
   p.DoMarker("Gecko");
 
@@ -204,27 +207,31 @@ void LoadFromBuffer(std::vector<u8>& buffer)
     return;
   }
 
-  Core::RunAsCPUThread([&] {
-    u8* ptr = &buffer[0];
-    PointerWrap p(&ptr, PointerWrap::MODE_READ);
-    DoState(p);
-  });
+  Core::RunOnCPUThread(
+      [&] {
+        u8* ptr = &buffer[0];
+        PointerWrap p(&ptr, PointerWrap::MODE_READ);
+        DoState(p);
+      },
+      true);
 }
 
 void SaveToBuffer(std::vector<u8>& buffer)
 {
-  Core::RunAsCPUThread([&] {
-    u8* ptr = nullptr;
-    PointerWrap p(&ptr, PointerWrap::MODE_MEASURE);
+  Core::RunOnCPUThread(
+      [&] {
+        u8* ptr = nullptr;
+        PointerWrap p(&ptr, PointerWrap::MODE_MEASURE);
 
-    DoState(p);
-    const size_t buffer_size = reinterpret_cast<size_t>(ptr);
-    buffer.resize(buffer_size);
+        DoState(p);
+        const size_t buffer_size = reinterpret_cast<size_t>(ptr);
+        buffer.resize(buffer_size);
 
-    ptr = &buffer[0];
-    p.SetMode(PointerWrap::MODE_WRITE);
-    DoState(p);
-  });
+        ptr = &buffer[0];
+        p.SetMode(PointerWrap::MODE_WRITE);
+        DoState(p);
+      },
+      true);
 }
 
 // return state number not in map
@@ -381,42 +388,51 @@ static void CompressAndDumpState(CompressAndDumpState_args save_args)
 
 void SaveAs(const std::string& filename, bool wait)
 {
-  Core::RunAsCPUThread([&] {
-    // Measure the size of the buffer.
-    u8* ptr = nullptr;
-    PointerWrap p(&ptr, PointerWrap::MODE_MEASURE);
-    DoState(p);
-    const size_t buffer_size = reinterpret_cast<size_t>(ptr);
+  if (s_load_or_save_in_progress)
+    return;
 
-    // Then actually do the write.
-    {
-      std::lock_guard<std::mutex> lk(g_cs_current_buffer);
-      g_current_buffer.resize(buffer_size);
-      ptr = &g_current_buffer[0];
-      p.SetMode(PointerWrap::MODE_WRITE);
-      DoState(p);
-    }
+  s_load_or_save_in_progress = true;
 
-    if (p.GetMode() == PointerWrap::MODE_WRITE)
-    {
-      Core::DisplayMessage("Saving State...", 1000);
+  Core::RunOnCPUThread(
+      [&] {
+        // Measure the size of the buffer.
+        u8* ptr = nullptr;
+        PointerWrap p(&ptr, PointerWrap::MODE_MEASURE);
+        DoState(p);
+        const size_t buffer_size = reinterpret_cast<size_t>(ptr);
 
-      CompressAndDumpState_args save_args;
-      save_args.buffer_vector = &g_current_buffer;
-      save_args.buffer_mutex = &g_cs_current_buffer;
-      save_args.filename = filename;
-      save_args.wait = wait;
+        // Then actually do the write.
+        {
+          std::lock_guard<std::mutex> lk(g_cs_current_buffer);
+          g_current_buffer.resize(buffer_size);
+          ptr = &g_current_buffer[0];
+          p.SetMode(PointerWrap::MODE_WRITE);
+          DoState(p);
+        }
 
-      Flush();
-      g_save_thread = std::thread(CompressAndDumpState, save_args);
-      g_compressAndDumpStateSyncEvent.Wait();
-    }
-    else
-    {
-      // someone aborted the save by changing the mode?
-      Core::DisplayMessage("Unable to save: Internal DoState Error", 4000);
-    }
-  });
+        if (p.GetMode() == PointerWrap::MODE_WRITE)
+        {
+          Core::DisplayMessage("Saving State...", 1000);
+
+          CompressAndDumpState_args save_args;
+          save_args.buffer_vector = &g_current_buffer;
+          save_args.buffer_mutex = &g_cs_current_buffer;
+          save_args.filename = filename;
+          save_args.wait = wait;
+
+          Flush();
+          g_save_thread = std::thread(CompressAndDumpState, save_args);
+          g_compressAndDumpStateSyncEvent.Wait();
+        }
+        else
+        {
+          // someone aborted the save by changing the mode?
+          Core::DisplayMessage("Unable to save: Internal DoState Error", 4000);
+        }
+      },
+      true);
+
+  s_load_or_save_in_progress = false;
 }
 
 bool ReadHeader(const std::string& filename, StateHeader& header)
@@ -515,7 +531,7 @@ static void LoadFileStateData(const std::string& filename, std::vector<u8>& ret_
 
 void LoadAs(const std::string& filename)
 {
-  if (!Core::IsRunning())
+  if (!Core::IsRunning() || s_load_or_save_in_progress)
   {
     return;
   }
@@ -525,64 +541,65 @@ void LoadAs(const std::string& filename)
     return;
   }
 
-  Core::RunAsCPUThread([&] {
-    g_loadDepth++;
+  s_load_or_save_in_progress = true;
 
-    // Save temp buffer for undo load state
-    if (!Movie::IsJustStartingRecordingInputFromSaveState())
-    {
-      std::lock_guard<std::mutex> lk(g_cs_undo_load_buffer);
-      SaveToBuffer(g_undo_load_buffer);
-      if (Movie::IsMovieActive())
-        Movie::SaveRecording(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm");
-      else if (File::Exists(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm"))
-        File::Delete(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm");
-    }
+  Core::RunOnCPUThread(
+      [&] {
+        // Save temp buffer for undo load state
+        if (!Movie::IsJustStartingRecordingInputFromSaveState())
+        {
+          std::lock_guard<std::mutex> lk(g_cs_undo_load_buffer);
+          SaveToBuffer(g_undo_load_buffer);
+          if (Movie::IsMovieActive())
+            Movie::SaveRecording(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm");
+          else if (File::Exists(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm"))
+            File::Delete(File::GetUserPath(D_STATESAVES_IDX) + "undo.dtm");
+        }
 
-    bool loaded = false;
-    bool loadedSuccessfully = false;
+        bool loaded = false;
+        bool loadedSuccessfully = false;
 
-    // brackets here are so buffer gets freed ASAP
-    {
-      std::vector<u8> buffer;
-      LoadFileStateData(filename, buffer);
+        // brackets here are so buffer gets freed ASAP
+        {
+          std::vector<u8> buffer;
+          LoadFileStateData(filename, buffer);
 
-      if (!buffer.empty())
-      {
-        u8* ptr = &buffer[0];
-        PointerWrap p(&ptr, PointerWrap::MODE_READ);
-        DoState(p);
-        loaded = true;
-        loadedSuccessfully = (p.GetMode() == PointerWrap::MODE_READ);
-      }
-    }
+          if (!buffer.empty())
+          {
+            u8* ptr = &buffer[0];
+            PointerWrap p(&ptr, PointerWrap::MODE_READ);
+            DoState(p);
+            loaded = true;
+            loadedSuccessfully = (p.GetMode() == PointerWrap::MODE_READ);
+          }
+        }
 
-    if (loaded)
-    {
-      if (loadedSuccessfully)
-      {
-        Core::DisplayMessage(StringFromFormat("Loaded state from %s", filename.c_str()), 2000);
-        if (File::Exists(filename + ".dtm"))
-          Movie::LoadInput(filename + ".dtm");
-        else if (!Movie::IsJustStartingRecordingInputFromSaveState() &&
-                 !Movie::IsJustStartingPlayingInputFromSaveState())
-          Movie::EndPlayInput(false);
-      }
-      else
-      {
-        Core::DisplayMessage("The savestate could not be loaded", OSD::Duration::NORMAL);
+        if (loaded)
+        {
+          if (loadedSuccessfully)
+          {
+            Core::DisplayMessage(StringFromFormat("Loaded state from %s", filename.c_str()), 2000);
+            if (File::Exists(filename + ".dtm"))
+              Movie::LoadInput(filename + ".dtm");
+            else if (!Movie::IsJustStartingRecordingInputFromSaveState() &&
+                     !Movie::IsJustStartingPlayingInputFromSaveState())
+              Movie::EndPlayInput(false);
+          }
+          else
+          {
+            Core::DisplayMessage("The savestate could not be loaded", OSD::Duration::NORMAL);
 
-        // since we could be in an inconsistent state now (and might crash or whatever), undo.
-        if (g_loadDepth < 2)
-          UndoLoadState();
-      }
-    }
+            // since we could be in an inconsistent state now (and might crash or whatever), undo.
+            UndoLoadState();
+          }
+        }
 
-    if (s_on_after_load_callback)
-      s_on_after_load_callback();
+        if (s_on_after_load_callback)
+          s_on_after_load_callback();
+      },
+      true);
 
-    g_loadDepth--;
-  });
+  s_load_or_save_in_progress = false;
 }
 
 void SetOnAfterLoadCallback(AfterLoadCallbackFunc callback)
diff --git a/Source/Core/DolphinQt/Config/Graphics/HacksWidget.cpp b/Source/Core/DolphinQt/Config/Graphics/HacksWidget.cpp
index 319b7eb309..a615f60896 100644
--- a/Source/Core/DolphinQt/Config/Graphics/HacksWidget.cpp
+++ b/Source/Core/DolphinQt/Config/Graphics/HacksWidget.cpp
@@ -100,10 +100,13 @@ void HacksWidget::CreateWidgets()
   m_disable_bounding_box =
       new GraphicsBool(tr("Disable Bounding Box"), Config::GFX_HACK_BBOX_ENABLE, true);
   m_vertex_rounding = new GraphicsBool(tr("Vertex Rounding"), Config::GFX_HACK_VERTEX_ROUDING);
+  m_save_texture_cache_state =
+      new GraphicsBool(tr("Save Texture Cache to State"), Config::GFX_SAVE_TEXTURE_CACHE_TO_STATE);
 
   other_layout->addWidget(m_fast_depth_calculation, 0, 0);
   other_layout->addWidget(m_disable_bounding_box, 0, 1);
   other_layout->addWidget(m_vertex_rounding, 1, 0);
+  other_layout->addWidget(m_save_texture_cache_state, 1, 1);
 
   main_layout->addWidget(efb_box);
   main_layout->addWidget(texture_cache_box);
@@ -244,6 +247,10 @@ void HacksWidget::AddDescriptions()
   static const char TR_DISABLE_BOUNDINGBOX_DESCRIPTION[] =
       QT_TR_NOOP("Disables bounding box emulation.\n\nThis may improve GPU performance "
                  "significantly, but some games will break.\n\nIf unsure, leave this checked.");
+  static const char TR_SAVE_TEXTURE_CACHE_TO_STATE_DESCRIPTION[] = QT_TR_NOOP(
+      "Includes the contents of the embedded frame buffer (EFB) and upscaled EFB copies "
+      "in save states. Fixes missing and/or non-upscaled textures/objects when loading "
+      "states at the cost of additional save/load time.\n\nIf unsure, leave this checked.");
   static const char TR_VERTEX_ROUNDING_DESCRIPTION[] =
       QT_TR_NOOP("Rounds 2D vertices to whole pixels.\n\nFixes graphical problems in some games at "
                  "higher internal resolutions. This setting has no effect when native internal "
@@ -259,6 +266,7 @@ void HacksWidget::AddDescriptions()
   AddDescription(m_gpu_texture_decoding, TR_GPU_DECODING_DESCRIPTION);
   AddDescription(m_fast_depth_calculation, TR_FAST_DEPTH_CALC_DESCRIPTION);
   AddDescription(m_disable_bounding_box, TR_DISABLE_BOUNDINGBOX_DESCRIPTION);
+  AddDescription(m_save_texture_cache_state, TR_SAVE_TEXTURE_CACHE_TO_STATE_DESCRIPTION);
   AddDescription(m_vertex_rounding, TR_VERTEX_ROUNDING_DESCRIPTION);
 }
 
diff --git a/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h b/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h
index 47de3c0287..d46cb67932 100644
--- a/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h
+++ b/Source/Core/DolphinQt/Config/Graphics/HacksWidget.h
@@ -42,6 +42,7 @@ private:
   QCheckBox* m_fast_depth_calculation;
   QCheckBox* m_disable_bounding_box;
   QCheckBox* m_vertex_rounding;
+  QCheckBox* m_save_texture_cache_state;
   QCheckBox* m_defer_efb_copies;
 
   void CreateWidgets();
diff --git a/Source/Core/VideoCommon/AsyncRequests.cpp b/Source/Core/VideoCommon/AsyncRequests.cpp
index 11a37afdfa..a824d0020a 100644
--- a/Source/Core/VideoCommon/AsyncRequests.cpp
+++ b/Source/Core/VideoCommon/AsyncRequests.cpp
@@ -11,6 +11,7 @@
 #include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VideoBackendBase.h"
 #include "VideoCommon/VideoCommon.h"
+#include "VideoCommon/VideoState.h"
 
 AsyncRequests AsyncRequests::s_singleton;
 
@@ -154,6 +155,10 @@ void AsyncRequests::HandleEvent(const AsyncRequests::Event& e)
   case Event::PERF_QUERY:
     g_perf_query->FlushResults();
     break;
+
+  case Event::DO_SAVE_STATE:
+    VideoCommon_DoState(*e.do_save_state.p);
+    break;
   }
 }
 
diff --git a/Source/Core/VideoCommon/AsyncRequests.h b/Source/Core/VideoCommon/AsyncRequests.h
index acd665b3b1..dc81667586 100644
--- a/Source/Core/VideoCommon/AsyncRequests.h
+++ b/Source/Core/VideoCommon/AsyncRequests.h
@@ -13,6 +13,7 @@
 #include "Common/Flag.h"
 
 struct EfbPokeData;
+class PointerWrap;
 
 class AsyncRequests
 {
@@ -28,6 +29,7 @@ public:
       SWAP_EVENT,
       BBOX_READ,
       PERF_QUERY,
+      DO_SAVE_STATE,
     } type;
     u64 time;
 
@@ -64,6 +66,11 @@ public:
       struct
       {
       } perf_query;
+
+      struct
+      {
+        PointerWrap* p;
+      } do_save_state;
     };
   };
 
diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp
index 12fc46b209..fdfca161b9 100644
--- a/Source/Core/VideoCommon/BPStructs.cpp
+++ b/Source/Core/VideoCommon/BPStructs.cpp
@@ -68,9 +68,6 @@ static void BPWritten(const BPCmd& bp)
   ----------------------------------------------------------------------------------------------------------------
   */
 
-  // check for invalid state, else unneeded configuration are built
-  g_video_backend->CheckInvalidState();
-
   if (((s32*)&bpmem)[bp.address] == bp.newvalue)
   {
     if (!(bp.address == BPMEM_TRIGGER_EFB_COPY || bp.address == BPMEM_CLEARBBOX1 ||
diff --git a/Source/Core/VideoCommon/Fifo.cpp b/Source/Core/VideoCommon/Fifo.cpp
index 36a9ee7307..f63aaff805 100644
--- a/Source/Core/VideoCommon/Fifo.cpp
+++ b/Source/Core/VideoCommon/Fifo.cpp
@@ -299,14 +299,15 @@ void RunGpuLoop()
       [] {
         const SConfig& param = SConfig::GetInstance();
 
+        // Run events from the CPU thread.
+        AsyncRequests::GetInstance()->PullEvents();
+
         // Do nothing while paused
         if (!s_emu_running_state.IsSet())
           return;
 
         if (s_use_deterministic_gpu_thread)
         {
-          AsyncRequests::GetInstance()->PullEvents();
-
           // All the fifo/CP stuff is on the CPU.  We just need to run the opcode decoder.
           u8* seen_ptr = s_video_buffer_seen_ptr;
           u8* write_ptr = s_video_buffer_write_ptr;
@@ -321,9 +322,6 @@ void RunGpuLoop()
         else
         {
           CommandProcessor::SCPFifoStruct& fifo = CommandProcessor::fifo;
-
-          AsyncRequests::GetInstance()->PullEvents();
-
           CommandProcessor::SetCPStatusFromGPU();
 
           // check if we are able to run this buffer
diff --git a/Source/Core/VideoCommon/FramebufferManager.cpp b/Source/Core/VideoCommon/FramebufferManager.cpp
index 6f4e132c06..f61ebcf4a9 100644
--- a/Source/Core/VideoCommon/FramebufferManager.cpp
+++ b/Source/Core/VideoCommon/FramebufferManager.cpp
@@ -7,8 +7,10 @@
 #include "VideoCommon/FramebufferShaderGen.h"
 #include "VideoCommon/VertexManagerBase.h"
 
+#include "Common/ChunkFile.h"
 #include "Common/Logging/Log.h"
 #include "Common/MsgHandler.h"
+#include "Core/Config/GraphicsSettings.h"
 #include "VideoCommon/AbstractFramebuffer.h"
 #include "VideoCommon/AbstractPipeline.h"
 #include "VideoCommon/AbstractShader.h"
@@ -464,6 +466,20 @@ bool FramebufferManager::CompileReadbackPipelines()
       return false;
   }
 
+  // EFB restore pipeline
+  auto restore_shader = g_renderer->CreateShaderFromSource(
+      ShaderStage::Pixel, FramebufferShaderGen::GenerateEFBRestorePixelShader());
+  if (!restore_shader)
+    return false;
+
+  config.framebuffer_state = GetEFBFramebufferState();
+  config.framebuffer_state.per_sample_shading = false;
+  config.vertex_shader = g_shader_cache->GetScreenQuadVertexShader();
+  config.pixel_shader = restore_shader.get();
+  m_efb_restore_pipeline = g_renderer->CreatePipeline(config);
+  if (!m_efb_restore_pipeline)
+    return false;
+
   return true;
 }
 
@@ -842,3 +858,107 @@ void FramebufferManager::DestroyPokePipelines()
   m_color_poke_pipeline.reset();
   m_poke_vertex_format.reset();
 }
+
+void FramebufferManager::DoState(PointerWrap& p)
+{
+  FlushEFBPokes();  // Flush queued pokes first (presumably so the EFB contents are current).
+
+  bool save_efb_state = Config::Get(Config::GFX_SAVE_TEXTURE_CACHE_TO_STATE);
+  p.Do(save_efb_state);  // NOTE(review): on load this presumably yields the value in the state.
+  if (!save_efb_state)
+    return;  // No EFB data follows in the stream when the flag is false.
+
+  if (p.GetMode() == PointerWrap::MODE_WRITE || p.GetMode() == PointerWrap::MODE_MEASURE)
+    DoSaveState(p);  // MEASURE takes the same path as WRITE.
+  else
+    DoLoadState(p);
+}
+
+void FramebufferManager::DoSaveState(PointerWrap& p)
+{
+  // For multisampling, we need to resolve first before we can save.
+  // This won't be bit-exact when loading, which could cause interesting rendering side-effects for
+  // a frame. But whatever, MSAA doesn't exactly behave that well anyway.
+  AbstractTexture* color_texture = ResolveEFBColorTexture(m_efb_color_texture->GetRect());
+  AbstractTexture* depth_texture = ResolveEFBDepthTexture(m_efb_depth_texture->GetRect());
+
+  // We don't want to save these as rendertarget textures, just the data itself when deserializing.
+  const TextureConfig color_texture_config(color_texture->GetWidth(), color_texture->GetHeight(),
+                                           color_texture->GetLevels(), color_texture->GetLayers(),
+                                           1, GetEFBColorFormat(), 0);
+  g_texture_cache->SerializeTexture(color_texture, color_texture_config, p);
+
+  if (GetEFBDepthFormat() == AbstractTextureFormat::D32F)
+  {
+    const TextureConfig depth_texture_config(
+        depth_texture->GetWidth(), depth_texture->GetHeight(), depth_texture->GetLevels(),
+        depth_texture->GetLayers(), 1,
+        AbstractTexture::GetColorFormatForDepthFormat(GetEFBDepthFormat()), 0);
+    g_texture_cache->SerializeTexture(depth_texture, depth_texture_config, p);
+  }
+  else
+  {
+    // If the EFB is backed by a D24S8 texture, we first have to convert it to R32F.
+    const TextureConfig temp_texture_config(depth_texture->GetWidth(), depth_texture->GetHeight(),
+                                            depth_texture->GetLevels(), depth_texture->GetLayers(),
+                                            1, AbstractTextureFormat::R32F,
+                                            AbstractTextureFlag_RenderTarget);
+    std::unique_ptr<AbstractTexture> temp_texture = g_renderer->CreateTexture(temp_texture_config);
+    std::unique_ptr<AbstractFramebuffer> temp_fb =
+        g_renderer->CreateFramebuffer(temp_texture.get(), nullptr);
+    if (temp_texture && temp_fb)
+    {
+      g_renderer->ScaleTexture(temp_fb.get(), temp_texture->GetRect(), depth_texture,
+                               depth_texture->GetRect());
+
+      const TextureConfig depth_texture_config(
+          depth_texture->GetWidth(), depth_texture->GetHeight(), depth_texture->GetLevels(),
+          depth_texture->GetLayers(), 1, temp_texture->GetFormat(), 0);
+      g_texture_cache->SerializeTexture(temp_texture.get(), depth_texture_config, p);  // R32F copy
+    }
+    else
+    {
+      PanicAlert("Failed to create temp texture for depth saving");
+      g_texture_cache->SerializeTexture(color_texture, color_texture_config, p);  // Keeps 2 entries
+    }
+  }
+}
+
+void FramebufferManager::DoLoadState(PointerWrap& p)
+{
+  // Invalidate any peek cache tiles.
+  InvalidatePeekCache(true);
+
+  // Deserialize the color and depth textures. This could fail.
+  auto color_tex = g_texture_cache->DeserializeTexture(p);
+  auto depth_tex = g_texture_cache->DeserializeTexture(p);
+
+  // If either texture failed to load, or the layer count (stereo mode) differs, throw it away.
+  if (!color_tex || !depth_tex ||
+      color_tex->texture->GetLayers() != m_efb_color_texture->GetLayers())
+  {
+    WARN_LOG(VIDEO, "Failed to deserialize EFB contents. Clearing instead.");
+    g_renderer->SetAndClearFramebuffer(
+        m_efb_framebuffer.get(), {{0.0f, 0.0f, 0.0f, 0.0f}},
+        g_ActiveConfig.backend_info.bSupportsReversedDepthRange ? 1.0f : 0.0f);
+    return;
+  }
+
+  // Size differences are okay here, since the linear filtering will downscale/upscale it.
+  // Depth buffer is always point sampled, since we don't want to interpolate depth values.
+  const bool rescale = color_tex->texture->GetWidth() != m_efb_color_texture->GetWidth() ||
+                       color_tex->texture->GetHeight() != m_efb_color_texture->GetHeight();
+
+  // Draw the deserialized textures over the EFB.
+  g_renderer->BeginUtilityDrawing();
+  g_renderer->SetAndDiscardFramebuffer(m_efb_framebuffer.get());
+  g_renderer->SetViewportAndScissor(m_efb_framebuffer->GetRect());
+  g_renderer->SetPipeline(m_efb_restore_pipeline.get());
+  g_renderer->SetTexture(0, color_tex->texture.get());  // Slot 0: saved color.
+  g_renderer->SetTexture(1, depth_tex->texture.get());  // Slot 1: saved depth.
+  g_renderer->SetSamplerState(0, rescale ? RenderState::GetLinearSamplerState() :
+                                           RenderState::GetPointSamplerState());
+  g_renderer->SetSamplerState(1, RenderState::GetPointSamplerState());
+  g_renderer->Draw(0, 3);
+  g_renderer->EndUtilityDrawing();
+}
diff --git a/Source/Core/VideoCommon/FramebufferManager.h b/Source/Core/VideoCommon/FramebufferManager.h
index b97d45b31e..b4ae99361d 100644
--- a/Source/Core/VideoCommon/FramebufferManager.h
+++ b/Source/Core/VideoCommon/FramebufferManager.h
@@ -17,6 +17,7 @@
 #include "VideoCommon/TextureConfig.h"
 
 class NativeVertexFormat;
+class PointerWrap;
 
 enum class EFBReinterpretType
 {
@@ -95,6 +96,9 @@ public:
   void PokeEFBDepth(u32 x, u32 y, float depth);
   void FlushEFBPokes();
 
+  // Save state load/save.
+  void DoState(PointerWrap& p);
+
 protected:
   struct EFBPokeVertex
   {
@@ -145,6 +149,9 @@ protected:
   void DrawPokeVertices(const EFBPokeVertex* vertices, u32 vertex_count,
                         const AbstractPipeline* pipeline);
 
+  void DoLoadState(PointerWrap& p);
+  void DoSaveState(PointerWrap& p);
+
   std::unique_ptr<AbstractTexture> m_efb_color_texture;
   std::unique_ptr<AbstractTexture> m_efb_convert_color_texture;
   std::unique_ptr<AbstractTexture> m_efb_depth_texture;
@@ -156,6 +163,9 @@ protected:
   std::unique_ptr<AbstractFramebuffer> m_efb_depth_resolve_framebuffer;
   std::unique_ptr<AbstractPipeline> m_efb_depth_resolve_pipeline;
 
+  // Pipeline for restoring the contents of the EFB from a save state
+  std::unique_ptr<AbstractPipeline> m_efb_restore_pipeline;
+
   // Format conversion shaders
   std::array<std::unique_ptr<AbstractPipeline>, 6> m_format_conversion_pipelines;
 
diff --git a/Source/Core/VideoCommon/FramebufferShaderGen.cpp b/Source/Core/VideoCommon/FramebufferShaderGen.cpp
index 5789c0a0d2..00ff753d92 100644
--- a/Source/Core/VideoCommon/FramebufferShaderGen.cpp
+++ b/Source/Core/VideoCommon/FramebufferShaderGen.cpp
@@ -644,4 +644,24 @@ std::string GenerateTextureReinterpretShader(TextureFormat from_format, TextureF
   return ss.str();
 }
 
+// Pixel shader for restoring the EFB: outputs texture 0 as color and texture 1's red as depth.
+std::string GenerateEFBRestorePixelShader()
+{
+  const bool is_d3d = GetAPIType() == APIType::D3D;
+  const char* y_flip = g_ActiveConfig.backend_info.bUsesLowerLeftOrigin ? "1.0 - " : "";
+  std::stringstream ss;
+  EmitSamplerDeclarations(ss, 0, 2, false);
+  EmitPixelMainDeclaration(ss, 1, 0, "float4", is_d3d ? "out float depth : SV_Depth, " : "");
+  ss << "{\n";
+  ss << "  float3 coords = float3(v_tex0.x, " << y_flip << "v_tex0.y, v_tex0.z);\n";
+  ss << "  ocol0 = ";
+  EmitSampleTexture(ss, 0, "coords");
+  ss << ";\n";
+  ss << "  " << (is_d3d ? "depth" : "gl_FragDepth") << " = ";
+  EmitSampleTexture(ss, 1, "coords");
+  ss << ".r;\n";
+  ss << "}\n";
+  return ss.str();
+}
+
 }  // namespace FramebufferShaderGen
diff --git a/Source/Core/VideoCommon/FramebufferShaderGen.h b/Source/Core/VideoCommon/FramebufferShaderGen.h
index b0134b5897..2ec50b4d76 100644
--- a/Source/Core/VideoCommon/FramebufferShaderGen.h
+++ b/Source/Core/VideoCommon/FramebufferShaderGen.h
@@ -30,5 +30,6 @@ std::string GenerateEFBPokeVertexShader();
 std::string GenerateColorPixelShader();
 std::string GenerateFormatConversionShader(EFBReinterpretType convtype, u32 samples);
 std::string GenerateTextureReinterpretShader(TextureFormat from_format, TextureFormat to_format);
+std::string GenerateEFBRestorePixelShader();
 
 }  // namespace FramebufferShaderGen
diff --git a/Source/Core/VideoCommon/RenderBase.cpp b/Source/Core/VideoCommon/RenderBase.cpp
index 40b6f9d51e..48b8e4cfd3 100644
--- a/Source/Core/VideoCommon/RenderBase.cpp
+++ b/Source/Core/VideoCommon/RenderBase.cpp
@@ -25,6 +25,7 @@
 #include <imgui.h>
 
 #include "Common/Assert.h"
+#include "Common/ChunkFile.h"
 #include "Common/CommonTypes.h"
 #include "Common/Config/Config.h"
 #include "Common/Event.h"
@@ -1324,8 +1325,11 @@ void Renderer::Swap(u32 xfb_addr, u32 fb_width, u32 fb_stride, u32 fb_height, u6
     }
 
     // Update our last xfb values
-    m_last_xfb_width = (fb_width < 1 || fb_width > MAX_XFB_WIDTH) ? MAX_XFB_WIDTH : fb_width;
-    m_last_xfb_height = (fb_height < 1 || fb_height > MAX_XFB_HEIGHT) ? MAX_XFB_HEIGHT : fb_height;
+    m_last_xfb_addr = xfb_addr;
+    m_last_xfb_ticks = ticks;
+    m_last_xfb_width = fb_width;
+    m_last_xfb_stride = fb_stride;
+    m_last_xfb_height = fb_height;
   }
   else
   {
@@ -1681,6 +1685,27 @@ bool Renderer::UseVertexDepthRange() const
   return fabs(xfmem.viewport.zRange) > 16777215.0f || fabs(xfmem.viewport.farZ) > 16777215.0f;
 }
 
+// Saves or restores the renderer fields below; after a load, the last XFB is presented again.
+void Renderer::DoState(PointerWrap& p)
+{
+  // These calls are mirrored on save and load, so their order must stay as it is.
+  p.Do(m_aspect_wide);
+  p.Do(m_frame_count);
+  p.Do(m_prev_efb_format);
+  p.Do(m_last_xfb_ticks);
+  p.Do(m_last_xfb_addr);
+  p.Do(m_last_xfb_width);
+  p.Do(m_last_xfb_stride);
+  p.Do(m_last_xfb_height);
+
+  // Nothing more to do unless a state was just loaded.
+  if (p.GetMode() != PointerWrap::MODE_READ)
+    return;
+  // Reset the duplicate-frame tracking so the restored XFB is displayed, then display it.
+  m_last_xfb_id = std::numeric_limits<u64>::max();
+  Swap(m_last_xfb_addr, m_last_xfb_width, m_last_xfb_stride, m_last_xfb_height, m_last_xfb_ticks);
+}
+
 std::unique_ptr<VideoCommon::AsyncShaderCompiler> Renderer::CreateAsyncShaderCompiler()
 {
   return std::make_unique<VideoCommon::AsyncShaderCompiler>();
diff --git a/Source/Core/VideoCommon/RenderBase.h b/Source/Core/VideoCommon/RenderBase.h
index 584f07bc41..116080a1a8 100644
--- a/Source/Core/VideoCommon/RenderBase.h
+++ b/Source/Core/VideoCommon/RenderBase.h
@@ -41,6 +41,7 @@ class AbstractTexture;
 class AbstractStagingTexture;
 class NativeVertexFormat;
 class NetPlayChatUI;
+class PointerWrap;
 struct TextureConfig;
 struct ComputePipelineConfig;
 struct AbstractPipelineConfig;
@@ -237,6 +238,7 @@ public:
   void ChangeSurface(void* new_surface_handle);
   void ResizeSurface();
   bool UseVertexDepthRange() const;
+  void DoState(PointerWrap& p);
 
   virtual std::unique_ptr<VideoCommon::AsyncShaderCompiler> CreateAsyncShaderCompiler();
 
@@ -356,9 +358,10 @@ private:
 
   // Tracking of XFB textures so we don't render duplicate frames.
   u64 m_last_xfb_id = std::numeric_limits<u64>::max();
-
-  // Note: Only used for auto-ir
+  u64 m_last_xfb_ticks = 0;
+  u32 m_last_xfb_addr = 0;
   u32 m_last_xfb_width = 0;
+  u32 m_last_xfb_stride = 0;
   u32 m_last_xfb_height = 0;
 
   // NOTE: The methods below are called on the framedumping thread.
diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index fb21603b6a..2f330bb2bc 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -15,6 +15,7 @@
 
 #include "Common/Align.h"
 #include "Common/Assert.h"
+#include "Common/ChunkFile.h"
 #include "Common/CommonTypes.h"
 #include "Common/FileUtil.h"
 #include "Common/Hash.h"
@@ -23,6 +24,7 @@
 #include "Common/MemoryUtil.h"
 #include "Common/StringUtil.h"
 
+#include "Core/Config/GraphicsSettings.h"
 #include "Core/ConfigManager.h"
 #include "Core/FifoPlayer/FifoPlayer.h"
 #include "Core/FifoPlayer/FifoRecorder.h"
@@ -404,6 +406,329 @@ void TextureCacheBase::ScaleTextureCacheEntryTo(TextureCacheBase::TCacheEntry* e
       config, TexPoolEntry(std::move(new_texture->texture), std::move(new_texture->framebuffer)));
 }
 
+// Makes m_readback_texture fit width x height in `format`; false if it cannot be created.
+bool TextureCacheBase::CheckReadbackTexture(u32 width, u32 height, AbstractTextureFormat format)
+{
+  if (m_readback_texture)
+  {
+    const auto& current = m_readback_texture->GetConfig();
+    if (current.width >= width && current.height >= height && current.format == format)
+      return true;
+  }
+  // Release the old staging texture first; the replacement is never smaller than 128x128.
+  m_readback_texture.reset();
+  const TextureConfig wanted(std::max(width, 128u), std::max(height, 128u), 1, 1, 1, format, 0);
+  m_readback_texture = g_renderer->CreateStagingTexture(StagingTextureType::Readback, wanted);
+  return m_readback_texture != nullptr;
+}
+
+// Writes `config` followed by the texel data of every layer/level of `tex` to the state.
+void TextureCacheBase::SerializeTexture(AbstractTexture* tex, const TextureConfig& config,
+                                        PointerWrap& p)
+{
+  // Measure mode only needs the buffer size, so the readback itself is skipped there.
+  const bool do_readback = p.GetMode() != PointerWrap::MODE_MEASURE;
+  p.DoPOD(config);
+
+  std::vector<u8> texture_data;
+  if (do_readback && !CheckReadbackTexture(config.width, config.height, config.format))
+  {
+    // Nothing is appended in this case, so an empty buffer is written below.
+    PanicAlert("Failed to create staging texture for serialization");
+  }
+  else
+  {
+    // Each sub-image is copied to the staging texture and then appended to texture_data, so
+    // all layers and levels end up in one buffer which is written out with a single call.
+    for (u32 layer = 0; layer < config.layers; layer++)
+    {
+      for (u32 level = 0; level < config.levels; level++)
+      {
+        const u32 mip_width = std::max(config.width >> level, 1u);
+        const u32 mip_height = std::max(config.height >> level, 1u);
+        const size_t stride = AbstractTexture::CalculateStrideForFormat(config.format, mip_width);
+        const size_t offset = texture_data.size();
+        texture_data.resize(offset + stride * mip_height);
+        if (!do_readback)
+          continue;
+
+        const auto rect = tex->GetConfig().GetMipRect(level);
+        m_readback_texture->CopyFromTexture(tex, rect, layer, level, rect);
+        m_readback_texture->ReadTexels(rect, &texture_data[offset], static_cast<u32>(stride));
+      }
+    }
+  }
+
+  p.Do(texture_data);
+}
+
+// Reads a texture written by SerializeTexture(); nullopt unless reading with non-empty data.
+std::optional<TextureCacheBase::TexPoolEntry> TextureCacheBase::DeserializeTexture(PointerWrap& p)
+{
+  TextureConfig config;
+  p.Do(config);
+  std::vector<u8> texture_data;
+  p.Do(texture_data);
+
+  const bool can_restore = p.GetMode() == PointerWrap::MODE_READ && !texture_data.empty();
+  if (!can_restore)
+    return std::nullopt;
+
+  auto tex = AllocateTexture(config);
+  if (!tex)
+  {
+    PanicAlert("Failed to create texture for deserialization");
+    return std::nullopt;
+  }
+
+  size_t offset = 0;
+  for (u32 layer = 0; layer < config.layers; layer++)
+  {
+    for (u32 level = 0; level < config.levels; level++)
+    {
+      const u32 mip_width = std::max(config.width >> level, 1u);
+      const u32 mip_height = std::max(config.height >> level, 1u);
+      const size_t stride = AbstractTexture::CalculateStrideForFormat(config.format, mip_width);
+      const size_t mip_size = stride * mip_height;
+      if (offset + mip_size > texture_data.size())
+      {
+        // The texture is still returned, with only the sub-images loaded so far.
+        ERROR_LOG(VIDEO, "Insufficient texture data for layer %u level %u", layer, level);
+        return tex;
+      }
+      tex->texture->Load(level, mip_width, mip_height, mip_width, &texture_data[offset], mip_size);
+      offset += mip_size;
+    }
+  }
+  return tex;
+}
+
+// Save state entry point for the texture cache; picks save or load from the PointerWrap mode.
+void TextureCacheBase::DoState(PointerWrap& p)
+{
+  // Pending EFB copies are flushed first, regardless of whether we are loading or saving.
+  FlushEFBCopies();
+  p.Do(last_entry_id);
+
+  if (p.GetMode() != PointerWrap::MODE_WRITE && p.GetMode() != PointerWrap::MODE_MEASURE)
+    DoLoadState(p);
+  else
+    DoSaveState(p);
+}
+
+// Writes all copy entries (texture contents plus metadata), the references between them and the
+// address/hash lookup lists. If SaveTextureCacheToState is off, every list is written empty.
+void TextureCacheBase::DoSaveState(PointerWrap& p)
+{
+  std::map<const TCacheEntry*, u32> entry_map;
+  std::vector<TCacheEntry*> entries_to_save;
+  auto ShouldSaveEntry = [](const TCacheEntry* entry) {
+    // We skip non-copies as they can be decoded from RAM when the state is loaded.
+    // Storing them would duplicate data in the save state file, adding to decompression time.
+    return entry->IsCopy();
+  };
+  auto AddCacheEntryToMap = [&entry_map, &entries_to_save, &p](TCacheEntry* entry) -> u32 {
+    auto iter = entry_map.find(entry);
+    if (iter != entry_map.end())
+      return iter->second;
+
+    // IDs are sequential and the loader re-derives them from the order entries are read in, so
+    // the vector keeps that order and the map gives fast lookup. NOTE(review): p is unused here.
+    u32 id = static_cast<u32>(entry_map.size());
+    entry_map.emplace(entry, id);
+    entries_to_save.push_back(entry);
+    return id;
+  };
+  auto GetCacheEntryId = [&entry_map](const TCacheEntry* entry) -> std::optional<u32> {
+    auto iter = entry_map.find(entry);
+    return iter != entry_map.end() ? std::make_optional(iter->second) : std::nullopt;
+  };
+
+  // Transform textures_by_address and textures_by_hash into lists of (address/hash, entry ID).
+  std::vector<std::pair<u32, u32>> textures_by_address_list;
+  std::vector<std::pair<u64, u32>> textures_by_hash_list;
+  if (Config::Get(Config::GFX_SAVE_TEXTURE_CACHE_TO_STATE))
+  {
+    for (const auto& it : textures_by_address)
+    {
+      if (ShouldSaveEntry(it.second))
+      {
+        u32 id = AddCacheEntryToMap(it.second);
+        textures_by_address_list.push_back(std::make_pair(it.first, id));
+      }
+    }
+    for (const auto& it : textures_by_hash)
+    {
+      if (ShouldSaveEntry(it.second))
+      {
+        u32 id = AddCacheEntryToMap(it.second);
+        textures_by_hash_list.push_back(std::make_pair(it.first, id));
+      }
+    }
+  }
+
+  // Save the texture cache entries out in the order they were referenced.
+  u32 size = static_cast<u32>(entries_to_save.size());
+  p.Do(size);
+  for (TCacheEntry* entry : entries_to_save)
+  {
+    g_texture_cache->SerializeTexture(entry->texture.get(), entry->texture->GetConfig(), p);
+    entry->DoState(p);
+  }
+  p.DoMarker("TextureCacheEntries");
+
+  // Save references for each cache entry, storing each linked pair only once. As references are
+  // circular, everything has to be created before entries can be linked on load.
+  std::set<std::pair<u32, u32>> reference_pairs;
+  for (const auto& it : entry_map)
+  {
+    const TCacheEntry* entry = it.first;
+    auto id1 = GetCacheEntryId(entry);
+    if (!id1)
+      continue;
+
+    for (const TCacheEntry* referenced_entry : entry->references)
+    {
+      auto id2 = GetCacheEntryId(referenced_entry);
+      if (!id2)
+        continue;
+
+      auto refpair1 = std::make_pair(*id1, *id2);
+      auto refpair2 = std::make_pair(*id2, *id1);
+      if (reference_pairs.count(refpair1) == 0 && reference_pairs.count(refpair2) == 0)
+        reference_pairs.insert(refpair1);
+    }
+  }
+
+  size = static_cast<u32>(reference_pairs.size());
+  p.Do(size);
+  for (const auto& it : reference_pairs)
+  {
+    p.Do(it.first);
+    p.Do(it.second);
+  }
+
+  size = static_cast<u32>(textures_by_address_list.size());
+  p.Do(size);
+  for (const auto& it : textures_by_address_list)
+  {
+    p.Do(it.first);
+    p.Do(it.second);
+  }
+
+  size = static_cast<u32>(textures_by_hash_list.size());
+  p.Do(size);
+  for (const auto& it : textures_by_hash_list)
+  {
+    p.Do(it.first);
+    p.Do(it.second);
+  }
+
+  // Free the readback texture to potentially save host-mapped GPU memory, depending on where
+  // the driver mapped the staging buffer.
+  m_readback_texture.reset();
+}
+
+// Restores the cache from the data written by DoSaveState(). Entries are only committed in
+// read mode; in any other mode the data is consumed and the entries are discarded.
+void TextureCacheBase::DoLoadState(PointerWrap& p)
+{
+  // Helper for getting a cache entry from an ID.
+  std::map<u32, TCacheEntry*> id_map;
+  auto GetEntry = [&id_map](u32 id) {
+    auto iter = id_map.find(id);
+    return iter == id_map.end() ? nullptr : iter->second;
+  };
+
+  // Only clear out state when actually restoring/loading. Entries are thrown away outside of
+  // read mode, so GetEntry returns null then and nothing below is inserted into the cache.
+  const bool commit_state = p.GetMode() == PointerWrap::MODE_READ;
+  if (commit_state)
+    Invalidate();
+
+  // Preload all cache entries.
+  u32 size = 0;
+  p.Do(size);
+  for (u32 i = 0; i < size; i++)
+  {
+    // DeserializeTexture() returns nullopt when the texture could not be restored, so it must
+    // not be dereferenced blindly. The entry object is still needed so that DoState() advances
+    // the pointer in the save state; it is freed again below if it has no texture.
+    auto tex = g_texture_cache->DeserializeTexture(p);
+    std::unique_ptr<AbstractTexture> texture = tex ? std::move(tex->texture) : nullptr;
+    std::unique_ptr<AbstractFramebuffer> fb = tex ? std::move(tex->framebuffer) : nullptr;
+    std::unique_ptr<TCacheEntry> entry(new TCacheEntry(std::move(texture), std::move(fb)));
+    entry->textures_by_hash_iter = g_texture_cache->textures_by_hash.end();
+    entry->DoState(p);
+    if (entry->texture && commit_state)
+      id_map.emplace(i, entry.release());
+  }
+  p.DoMarker("TextureCacheEntries");
+
+  // Link all cache entry references.
+  p.Do(size);
+  for (u32 i = 0; i < size; i++)
+  {
+    u32 id1 = 0, id2 = 0;
+    p.Do(id1);
+    p.Do(id2);
+    TCacheEntry* e1 = GetEntry(id1);
+    TCacheEntry* e2 = GetEntry(id2);
+    if (e1 && e2)
+      e1->CreateReference(e2);
+  }
+
+  // Fill in address map.
+  p.Do(size);
+  for (u32 i = 0; i < size; i++)
+  {
+    u32 addr = 0;
+    u32 id = 0;
+    p.Do(addr);
+    p.Do(id);
+    TCacheEntry* entry = GetEntry(id);
+    if (entry)
+      textures_by_address.emplace(addr, entry);
+  }
+
+  // Fill in hash map.
+  p.Do(size);
+  for (u32 i = 0; i < size; i++)
+  {
+    u64 hash = 0;
+    u32 id = 0;
+    p.Do(hash);
+    p.Do(id);
+    TCacheEntry* entry = GetEntry(id);
+    if (entry)
+      entry->textures_by_hash_iter = textures_by_hash.emplace(hash, entry);
+  }
+}
+
+void TextureCacheBase::TCacheEntry::DoState(PointerWrap& p)
+{
+  p.Do(addr);  // Entry metadata only; texel data goes through (De)SerializeTexture().
+  p.Do(size_in_bytes);
+  p.Do(base_hash);
+  p.Do(hash);
+  p.Do(format);
+  p.Do(memory_stride);
+  p.Do(is_efb_copy);
+  p.Do(is_custom_tex);
+  p.Do(may_have_overlapping_textures);
+  p.Do(tmem_only);
+  p.Do(has_arbitrary_mips);
+  p.Do(should_force_safe_hashing);
+  p.Do(is_xfb_copy);
+  p.Do(is_xfb_container);
+  p.Do(id);
+  p.Do(reference_changed);
+  p.Do(native_width);
+  p.Do(native_height);
+  p.Do(native_levels);
+  p.Do(frameCount);  // References and map membership are written by DoSaveState() instead.
+}
+
 TextureCacheBase::TCacheEntry*
 TextureCacheBase::DoPartialTextureUpdates(TCacheEntry* entry_to_update, u8* palette,
                                           TLUTFormat tlutfmt)
diff --git a/Source/Core/VideoCommon/TextureCacheBase.h b/Source/Core/VideoCommon/TextureCacheBase.h
index 12b39039dd..5e5a28b34b 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.h
+++ b/Source/Core/VideoCommon/TextureCacheBase.h
@@ -24,6 +24,7 @@
 
 class AbstractFramebuffer;
 class AbstractStagingTexture;
+class PointerWrap;
 struct VideoConfig;
 
 struct TextureAndTLUTFormat
@@ -185,6 +186,17 @@ public:
     u32 GetNumLevels() const { return texture->GetConfig().levels; }
     u32 GetNumLayers() const { return texture->GetConfig().layers; }
     AbstractTextureFormat GetFormat() const { return texture->GetConfig().format; }
+    void DoState(PointerWrap& p);
+  };
+
+  // Minimal version of TCacheEntry, used by TexPool and as the result of DeserializeTexture()
+  struct TexPoolEntry
+  {
+    std::unique_ptr<AbstractTexture> texture;
+    std::unique_ptr<AbstractFramebuffer> framebuffer;
+    int frameCount = FRAMECOUNT_INVALID;
+
+    TexPoolEntry(std::unique_ptr<AbstractTexture> tex, std::unique_ptr<AbstractFramebuffer> fb);
   };
 
   TextureCacheBase();
@@ -224,6 +236,13 @@ public:
   // Flushes all pending EFB copies to emulated RAM.
   void FlushEFBCopies();
 
+  // Texture Serialization
+  void SerializeTexture(AbstractTexture* tex, const TextureConfig& config, PointerWrap& p);
+  std::optional<TexPoolEntry> DeserializeTexture(PointerWrap& p);
+
+  // Save States
+  void DoState(PointerWrap& p);
+
   // Returns false if the top/bottom row coefficients are zero.
   static bool NeedsCopyFilterInShader(const EFBCopyFilterCoefficients& coefficients);
 
@@ -256,15 +275,6 @@ protected:
   static std::bitset<8> valid_bind_points;
 
 private:
-  // Minimal version of TCacheEntry just for TexPool
-  struct TexPoolEntry
-  {
-    std::unique_ptr<AbstractTexture> texture;
-    std::unique_ptr<AbstractFramebuffer> framebuffer;
-    int frameCount = FRAMECOUNT_INVALID;
-
-    TexPoolEntry(std::unique_ptr<AbstractTexture> tex, std::unique_ptr<AbstractFramebuffer> fb);
-  };
   using TexAddrCache = std::multimap<u32, TCacheEntry*>;
   using TexHashCache = std::multimap<u64, TCacheEntry*>;
   using TexPool = std::unordered_multimap<TextureConfig, TexPoolEntry>;
@@ -319,6 +329,10 @@ private:
   // Returns an EFB copy staging texture to the pool, so it can be re-used.
   void ReleaseEFBCopyStagingTexture(std::unique_ptr<AbstractStagingTexture> tex);
 
+  bool CheckReadbackTexture(u32 width, u32 height, AbstractTextureFormat format);
+  void DoSaveState(PointerWrap& p);
+  void DoLoadState(PointerWrap& p);
+
   TexAddrCache textures_by_address;
   TexHashCache textures_by_hash;
   TexPool texture_pool;
@@ -354,6 +368,11 @@ private:
   // List of pending EFB copies. It is important that the order is preserved for these,
   // so that overlapping textures are written to guest RAM in the order they are issued.
   std::vector<TCacheEntry*> m_pending_efb_copies;
+
+  // Staging texture used for readbacks.
+  // We store this in the class so that the same staging texture can be used for multiple
+  // readbacks, saving the overhead of allocating a new buffer every time.
+  std::unique_ptr<AbstractStagingTexture> m_readback_texture;
 };
 
 extern std::unique_ptr<TextureCacheBase> g_texture_cache;
diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp
index c9fa12f6bb..f1150dc340 100644
--- a/Source/Core/VideoCommon/VertexManagerBase.cpp
+++ b/Source/Core/VideoCommon/VertexManagerBase.cpp
@@ -338,9 +338,6 @@ void VertexManagerBase::Flush()
 
   m_is_flushed = true;
 
-  // loading a state will invalidate BP, so check for it
-  g_video_backend->CheckInvalidState();
-
 #if defined(_DEBUG) || defined(DEBUGFAST)
   PRIM_LOG("frame%d:\n texgen=%u, numchan=%u, dualtex=%u, ztex=%u, cole=%u, alpe=%u, ze=%u",
            g_ActiveConfig.iSaveTargetId, xfmem.numTexGen.numTexGens, xfmem.numChan.numColorChans,
@@ -464,6 +461,16 @@ void VertexManagerBase::Flush()
 
 void VertexManagerBase::DoState(PointerWrap& p)
 {
+  if (p.GetMode() == PointerWrap::MODE_READ)
+  {
+    // Flush old vertex data before loading state.
+    Flush();
+
+    // Clear all caches that touch RAM. They don't appear to hold any emulation state that
+    // gets saved, so this is only done on load (moved here from VideoBackendBase::DoState).
+    VertexLoaderManager::MarkAllDirty();
+  }
+
   p.Do(m_zslope);
 }
 
diff --git a/Source/Core/VideoCommon/VideoBackendBase.cpp b/Source/Core/VideoCommon/VideoBackendBase.cpp
index aa4cd8f6dd..8ee7eb1202 100644
--- a/Source/Core/VideoCommon/VideoBackendBase.cpp
+++ b/Source/Core/VideoCommon/VideoBackendBase.cpp
@@ -40,6 +40,7 @@
 #include "VideoCommon/RenderBase.h"
 #include "VideoCommon/TextureCacheBase.h"
 #include "VideoCommon/VertexLoaderManager.h"
+#include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VertexShaderManager.h"
 #include "VideoCommon/VideoCommon.h"
 #include "VideoCommon/VideoConfig.h"
@@ -236,41 +237,22 @@ void VideoBackendBase::PopulateBackendInfo()
   g_Config.Refresh();
 }
 
-// Run from the CPU thread
 void VideoBackendBase::DoState(PointerWrap& p)
 {
-  bool software = false;
-  p.Do(software);
-
-  if (p.GetMode() == PointerWrap::MODE_READ && software == true)
+  if (!SConfig::GetInstance().bCPUThread)
   {
-    // change mode to abort load of incompatible save state.
-    p.SetMode(PointerWrap::MODE_VERIFY);
+    VideoCommon_DoState(p);
+    return;
   }
 
-  VideoCommon_DoState(p);
-  p.DoMarker("VideoCommon");
+  AsyncRequests::Event ev = {};
+  ev.do_save_state.p = &p;
+  ev.type = AsyncRequests::Event::DO_SAVE_STATE;  // used for loads as well; p carries the mode
+  AsyncRequests::GetInstance()->PushEvent(ev, true);  // presumably blocking - TODO confirm
 
-  // Refresh state.
-  if (p.GetMode() == PointerWrap::MODE_READ)
-  {
-    m_invalid = true;
-
-    // Clear all caches that touch RAM
-    // (? these don't appear to touch any emulation state that gets saved. moved to on load only.)
-    VertexLoaderManager::MarkAllDirty();
-  }
-}
-
-void VideoBackendBase::CheckInvalidState()
-{
-  if (m_invalid)
-  {
-    m_invalid = false;
-
-    BPReload();
-    g_texture_cache->Invalidate();
-  }
+  // Let the GPU thread sleep once the state event is done, so it is not left spinning if
+  // emulation is paused after loading a state. The next GP burst will wake it up again.
+  Fifo::GpuMaySleep();
 }
 
 void VideoBackendBase::InitializeShared()
@@ -282,8 +264,6 @@ void VideoBackendBase::InitializeShared()
   // do not initialize again for the config window
   m_initialized = true;
 
-  m_invalid = false;
-
   CommandProcessor::Init();
   Fifo::Init();
   OpcodeDecoder::Init();
diff --git a/Source/Core/VideoCommon/VideoBackendBase.h b/Source/Core/VideoCommon/VideoBackendBase.h
index d1dada2247..0a248dbd70 100644
--- a/Source/Core/VideoCommon/VideoBackendBase.h
+++ b/Source/Core/VideoCommon/VideoBackendBase.h
@@ -63,18 +63,14 @@ public:
   // Called by the UI thread when the graphics config is opened.
   static void PopulateBackendInfo();
 
-  // the implementation needs not do synchronization logic, because calls to it are surrounded by
-  // PauseAndLock now
+  // Wrapper function which pushes the event to the GPU thread.
   void DoState(PointerWrap& p);
 
-  void CheckInvalidState();
-
 protected:
   void InitializeShared();
   void ShutdownShared();
 
   bool m_initialized = false;
-  bool m_invalid = false;
 };
 
 extern std::vector<std::unique_ptr<VideoBackendBase>> g_available_video_backends;
diff --git a/Source/Core/VideoCommon/VideoState.cpp b/Source/Core/VideoCommon/VideoState.cpp
index ab84e1b001..9b6418f98f 100644
--- a/Source/Core/VideoCommon/VideoState.cpp
+++ b/Source/Core/VideoCommon/VideoState.cpp
@@ -10,9 +10,12 @@
 #include "VideoCommon/CPMemory.h"
 #include "VideoCommon/CommandProcessor.h"
 #include "VideoCommon/Fifo.h"
+#include "VideoCommon/FramebufferManager.h"
 #include "VideoCommon/GeometryShaderManager.h"
 #include "VideoCommon/PixelEngine.h"
 #include "VideoCommon/PixelShaderManager.h"
+#include "VideoCommon/RenderBase.h"
+#include "VideoCommon/TextureCacheBase.h"
 #include "VideoCommon/TextureDecoder.h"
 #include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VertexShaderManager.h"
@@ -21,6 +24,15 @@
 
 void VideoCommon_DoState(PointerWrap& p)
 {
+  bool software = false;
+  p.Do(software);
+
+  if (p.GetMode() == PointerWrap::MODE_READ && software == true)
+  {
+    // change mode to abort load of incompatible save state.
+    p.SetMode(PointerWrap::MODE_VERIFY);
+  }
+
   // BP Memory
   p.Do(bpmem);
   p.DoMarker("BP Memory");
@@ -63,5 +75,19 @@ void VideoCommon_DoState(PointerWrap& p)
   BoundingBox::DoState(p);
   p.DoMarker("BoundingBox");
 
-  // TODO: search for more data that should be saved and add it here
+  g_framebuffer_manager->DoState(p);
+  p.DoMarker("FramebufferManager");
+
+  g_texture_cache->DoState(p);
+  p.DoMarker("TextureCache");
+
+  g_renderer->DoState(p);
+  p.DoMarker("Renderer");
+
+  // Refresh state.
+  if (p.GetMode() == PointerWrap::MODE_READ)
+  {
+    // Inform backend of new state from registers.
+    BPReload();
+  }
 }