From 40b7cc925254d113ec87366db155707df313bbf9 Mon Sep 17 00:00:00 2001
From: degasus <wickmarkus@web.de>
Date: Tue, 7 Feb 2017 09:27:30 +0100
Subject: [PATCH] JitArm64: Use a custom stack with proper guard pages.
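
The BLR optimization pushes a return target onto the host stack for every
emulated call, so deeply nested or non-returning guest code can grow the
stack well past its expected depth.  On non-Windows hosts, give the JIT its
own 2 MiB stack with two read-protected guard pages, so an overflow faults
cleanly instead of silently corrupting the thread stack.  On Windows, keep
the system stack and use SetThreadStackGuarantee() to reserve space at its
end.  GenerateAsm() now switches SP to the custom stack on entry when one
was allocated (saving the host SP in m_saved_stack_pointer) and restores
the host SP on exit.

Stack layout implied by the new constants (addresses grow upwards):

  m_stack_base + STACK_SIZE    ---------------------------------
                                 SAFE_STACK_SIZE        (512 KiB)
  m_stack_base + GUARD_OFFSET
               + GUARD_SIZE    ---------------------------------
                                 middle guard, read-protected (64 KiB)
  m_stack_base + GUARD_OFFSET  --------------------------------- <- initial SP
                                 working stack          (~1.4 MiB)
  m_stack_base + GUARD_SIZE    ---------------------------------
                                 bottom guard, read-protected (64 KiB)
  m_stack_base                 ---------------------------------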

---
 Source/Core/Core/PowerPC/JitArm64/Jit.cpp    | 50 ++++++++++++++++++--
 Source/Core/Core/PowerPC/JitArm64/Jit.h      |  8 ++--
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 17 +++++--
 3 files changed, 66 insertions(+), 9 deletions(-)
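
Reviewer note (not part of the patch): the guard pages are the core of the
change.  Below is a minimal standalone sketch of the same technique, assuming
Common::AllocateMemoryPages() and Common::ReadProtectMemory() boil down to
mmap()/mprotect() on POSIX hosts; AllocGuardedStack() and the reuse of the
constants are illustrative only.

// Standalone illustration only -- not Dolphin code.
#include <sys/mman.h>
#include <cstddef>
#include <cstdint>

constexpr size_t STACK_SIZE = 2 * 1024 * 1024;
constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
constexpr size_t GUARD_SIZE = 0x10000;
constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;

// Allocate a stack whose bottom page and "middle" page fault on any access,
// so a runaway stack is caught instead of overwriting neighbouring memory.
static uint8_t* AllocGuardedStack()
{
  void* base = mmap(nullptr, STACK_SIZE, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (base == MAP_FAILED)
    return nullptr;

  uint8_t* stack = static_cast<uint8_t*>(base);
  mprotect(stack, GUARD_SIZE, PROT_NONE);                 // bottom guard (permanent)
  mprotect(stack + GUARD_OFFSET, GUARD_SIZE, PROT_NONE);  // middle guard
  // The JIT then points SP at stack + GUARD_OFFSET and lets it grow downward.
  return stack;
}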

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
index 190667daad..9cb3940de0 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp
@@ -26,7 +26,15 @@
 
 using namespace Arm64Gen;
 
-static const int AARCH64_FARCODE_SIZE = 1024 * 1024 * 16;
+constexpr size_t CODE_SIZE = 1024 * 1024 * 32;
+constexpr size_t FARCODE_SIZE = 1024 * 1024 * 16;
+constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 48;
+
+constexpr size_t STACK_SIZE = 2 * 1024 * 1024;
+constexpr size_t SAFE_STACK_SIZE = 512 * 1024;
+constexpr size_t GUARD_SIZE = 0x10000;  // two guards - bottom (permanent) and middle
+constexpr size_t GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE;
+
 static bool HasCycleCounters()
 {
   // Bit needs to be set to support cycle counters
@@ -38,7 +46,7 @@ static bool HasCycleCounters()
 
 void JitArm64::Init()
 {
-  size_t child_code_size = SConfig::GetInstance().bMMU ? FARCODE_SIZE_MMU : AARCH64_FARCODE_SIZE;
+  size_t child_code_size = SConfig::GetInstance().bMMU ? FARCODE_SIZE_MMU : FARCODE_SIZE;
   AllocCodeSpace(CODE_SIZE + child_code_size);
   AddChildCodeSpace(&farcode, child_code_size);
   jo.enableBlocklink = true;
@@ -56,6 +64,7 @@ void JitArm64::Init()
   analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
   m_enable_blr_optimization = true;
 
+  AllocStack();
   GenerateAsm();
 
   m_supports_cycle_counter = HasCycleCounters();
@@ -78,6 +87,7 @@ void JitArm64::Shutdown()
 {
   FreeCodeSpace();
   blocks.Shutdown();
+  FreeStack();
 }
 
 void JitArm64::FallBackToInterpreter(UGeckoInstruction inst)
@@ -199,7 +209,41 @@ void JitArm64::ResetStack()
     return;
 
   LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
-  SUB(SP, X0, 16);
+  ADD(SP, X0, 0);
+}
+
+void JitArm64::AllocStack()
+{
+  if (!m_enable_blr_optimization)
+    return;
+
+#ifndef _WIN32
+  m_stack_base = static_cast<u8*>(Common::AllocateMemoryPages(STACK_SIZE));
+  if (!m_stack_base)
+  {
+    m_enable_blr_optimization = false;
+    return;
+  }
+
+  m_stack_pointer = m_stack_base + GUARD_OFFSET;
+  Common::ReadProtectMemory(m_stack_base, GUARD_SIZE);
+  Common::ReadProtectMemory(m_stack_pointer, GUARD_SIZE);
+#else
+  // For Windows, we just keep using the system stack and reserve a large amount of memory at the
+  // end of the stack.
+  ULONG reserveSize = SAFE_STACK_SIZE;
+  SetThreadStackGuarantee(&reserveSize);
+#endif
+}
+
+void JitArm64::FreeStack()
+{
+#ifndef _WIN32
+  if (m_stack_base)
+    Common::FreeMemoryPages(m_stack_base, STACK_SIZE);
+  m_stack_base = nullptr;
+  m_stack_pointer = nullptr;
+#endif
 }
 
 void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return)
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 92b5ffa4c1..e0613f82b9 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -18,9 +18,6 @@
 #include "Core/PowerPC/JitCommon/JitBase.h"
 #include "Core/PowerPC/PPCAnalyst.h"
 
-constexpr size_t CODE_SIZE = 1024 * 1024 * 32;
-constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 48;
-
 class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonAsmRoutinesBase
 {
 public:
@@ -191,6 +188,9 @@ private:
   bool m_supports_cycle_counter;
 
   bool m_enable_blr_optimization;
+  u8* m_stack_base = nullptr;
+  u8* m_stack_pointer = nullptr;
+  u8* m_saved_stack_pointer = nullptr;
 
   void EmitResetCycleCounters();
   void EmitGetCycles(Arm64Gen::ARM64Reg reg);
@@ -226,6 +226,8 @@ private:
   void DoDownCount();
   void Cleanup();
   void ResetStack();
+  void AllocStack();
+  void FreeStack();
 
   // AsmRoutines
   void GenerateAsm();
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 41979be0fa..bd35f2ace4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -28,14 +28,24 @@ void JitArm64::GenerateAsm()
 
   MOVP2R(PPC_REG, &PowerPC::ppcState);
 
-  // Store the stack pointer, so we can reset it if the BLR optimization fails.
+  // Swap the stack pointer, so we have proper guard pages.
   ADD(X0, SP, 0);
-  STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
+  MOVP2R(X1, &m_saved_stack_pointer);
+  STR(INDEX_UNSIGNED, X0, X1, 0);
+  MOVP2R(X1, &m_stack_pointer);
+  LDR(INDEX_UNSIGNED, X0, X1, 0);
+  FixupBranch no_fake_stack = CBZ(X0);
+  ADD(SP, X0, 0);
+  SetJumpTarget(no_fake_stack);
 
   // Push {nullptr; -1} as invalid destination on the stack.
   MOVI2R(X0, 0xFFFFFFFF);
   STP(INDEX_PRE, ZR, X0, SP, -16);
 
+  // Store the stack pointer, so we can reset it if the BLR optimization fails.
+  ADD(X0, SP, 0);
+  STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
+
   // The PC will be loaded into DISPATCHER_PC after the call to CoreTiming::Advance().
   // Advance() does an exception check so we don't know what PC to use until afterwards.
   FixupBranch to_start_of_timing_slice = B();
@@ -161,7 +171,8 @@ void JitArm64::GenerateAsm()
   SetJumpTarget(Exit);
 
   // Reset the stack pointer, as the BLR optimization has touched it.
-  LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(stored_stack_pointer));
+  MOVP2R(X1, &m_saved_stack_pointer);
+  LDR(INDEX_UNSIGNED, X0, X1, 0);
   ADD(SP, X0, 0);
 
   ABI_PopRegisters(regs_to_save);