diff --git a/Source/Core/Core/Config/MainSettings.cpp b/Source/Core/Core/Config/MainSettings.cpp index c88849a795..a6f3a890aa 100644 --- a/Source/Core/Core/Config/MainSettings.cpp +++ b/Source/Core/Core/Config/MainSettings.cpp @@ -37,6 +37,7 @@ const Info MAIN_CPU_CORE{{System::Main, "Core", "CPUCore"}, PowerPC::DefaultCPUCore()}; const Info MAIN_JIT_FOLLOW_BRANCH{{System::Main, "Core", "JITFollowBranch"}, true}; const Info MAIN_FASTMEM{{System::Main, "Core", "Fastmem"}, true}; +const Info MAIN_ACCURATE_CPU_CACHE{{System::Main, "Core", "AccurateCPUCache"}, false}; const Info MAIN_DSP_HLE{{System::Main, "Core", "DSPHLE"}, true}; const Info MAIN_TIMING_VARIANCE{{System::Main, "Core", "TimingVariance"}, 40}; const Info MAIN_CPU_THREAD{{System::Main, "Core", "CPUThread"}, true}; diff --git a/Source/Core/Core/Config/MainSettings.h b/Source/Core/Core/Config/MainSettings.h index 0730681f2d..92b909adf5 100644 --- a/Source/Core/Core/Config/MainSettings.h +++ b/Source/Core/Core/Config/MainSettings.h @@ -55,6 +55,7 @@ extern const Info MAIN_SKIP_IPL; extern const Info MAIN_CPU_CORE; extern const Info MAIN_JIT_FOLLOW_BRANCH; extern const Info MAIN_FASTMEM; +extern const Info MAIN_ACCURATE_CPU_CACHE; // Should really be in the DSP section, but we're kind of stuck with bad decisions made in the past.
extern const Info MAIN_DSP_HLE; extern const Info MAIN_TIMING_VARIANCE; diff --git a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp index c93375c148..fedf6c6078 100644 --- a/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp +++ b/Source/Core/Core/ConfigLoaders/IsSettingSaveable.cpp @@ -127,6 +127,7 @@ bool IsSettingSaveable(const Config::Location& config_location) &Config::MAIN_CPU_THREAD.GetLocation(), &Config::MAIN_MMU.GetLocation(), &Config::MAIN_PAUSE_ON_PANIC.GetLocation(), + &Config::MAIN_ACCURATE_CPU_CACHE.GetLocation(), &Config::MAIN_BB_DUMP_PORT.GetLocation(), &Config::MAIN_SYNC_GPU.GetLocation(), &Config::MAIN_SYNC_GPU_MAX_DISTANCE.GetLocation(), diff --git a/Source/Core/Core/DolphinAnalytics.cpp b/Source/Core/Core/DolphinAnalytics.cpp index 93d42d2db9..ea551143ec 100644 --- a/Source/Core/Core/DolphinAnalytics.cpp +++ b/Source/Core/Core/DolphinAnalytics.cpp @@ -135,8 +135,7 @@ void DolphinAnalytics::ReportGameStart() } // Keep in sync with enum class GameQuirk definition. -constexpr std::array GAME_QUIRKS_NAMES{ - "icache-matters", +constexpr std::array GAME_QUIRKS_NAMES{ "directly-reads-wiimote-input", "uses-DVDLowStopLaser", "uses-DVDLowOffset", diff --git a/Source/Core/Core/DolphinAnalytics.h b/Source/Core/Core/DolphinAnalytics.h index c708ad3a13..98c7d4a973 100644 --- a/Source/Core/Core/DolphinAnalytics.h +++ b/Source/Core/Core/DolphinAnalytics.h @@ -21,12 +21,9 @@ enum class GameQuirk { - // Sometimes code run from ICache is different from its mirror in RAM. - ICACHE_MATTERS = 0, - // The Wii remote hardware makes it possible to bypass normal data reporting and directly // "read" extension or IR data. This would break our current TAS/NetPlay implementation.
- DIRECTLY_READS_WIIMOTE_INPUT, + DIRECTLY_READS_WIIMOTE_INPUT = 0, // Several Wii DI commands that are rarely/never used and not implemented by Dolphin USES_DVD_LOW_STOP_LASER, diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 88f9f9997c..1c1a341c85 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -438,14 +438,17 @@ void Interpreter::dcba(UGeckoInstruction inst) void Interpreter::dcbf(UGeckoInstruction inst) { - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.) const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) + JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::FlushDCacheLine(address); } void Interpreter::dcbi(UGeckoInstruction inst) @@ -456,42 +459,44 @@ void Interpreter::dcbi(UGeckoInstruction inst) return; } - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.)
const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) + JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::InvalidateDCacheLine(address); } void Interpreter::dcbst(UGeckoInstruction inst) { - // TODO: Implement some sort of L2 emulation. - // TODO: Raise DSI if translation fails (except for direct-store segments). - - // Invalidate the JIT cache here as a heuristic to compensate for - // the lack of precise L1 icache emulation in the JIT. (Portable software - // should use icbi consistently, but games aren't portable.) const u32 address = Helper_Get_EA_X(PowerPC::ppcState, inst); - JitInterface::InvalidateICacheLine(address); + if (!PowerPC::ppcState.m_enable_dcache) + { + // Invalidate the JIT cache here as a heuristic to compensate for + // the lack of precise L1 icache emulation in the JIT. (Portable software + // should use icbi consistently, but games aren't portable.) + JitInterface::InvalidateICacheLine(address); + return; + } + + PowerPC::StoreDCacheLine(address); } +// dcbt and dcbtst only hint that it might be optimal to prefetch the specified cache line into +// the data cache. The CPU is never guaranteed to do this fetch, and in practice it's not more +// performant to emulate it, so both handlers below are intentionally empty. + void Interpreter::dcbt(UGeckoInstruction inst) { - if (HID0.NOOPTI) - return; - - // TODO: Implement some sort of L2 emulation. } void Interpreter::dcbtst(UGeckoInstruction inst) { - if (HID0.NOOPTI) - return; - - // TODO: Implement some sort of L2 emulation. } void Interpreter::dcbz(UGeckoInstruction inst) @@ -504,15 +509,18 @@ void Interpreter::dcbz(UGeckoInstruction inst) return; } - // Hack to stop dcbz/dcbi over low MEM1 trashing memory.
- if ((dcbz_addr < 0x80008000) && (dcbz_addr >= 0x80000000) && - Config::Get(Config::MAIN_LOW_DCBZ_HACK)) + if (!PowerPC::ppcState.m_enable_dcache) { - return; + // Hack to stop dcbz/dcbi over low MEM1 trashing memory. This is not needed if data cache + // emulation is enabled. + if ((dcbz_addr < 0x80008000) && (dcbz_addr >= 0x80000000) && + Config::Get(Config::MAIN_LOW_DCBZ_HACK)) + { + return; + } } - // TODO: Implement some sort of L2 emulation. - PowerPC::ClearCacheLine(dcbz_addr & (~31)); + PowerPC::ClearDCacheLine(dcbz_addr & (~31)); } void Interpreter::dcbz_l(UGeckoInstruction inst) @@ -531,8 +539,7 @@ void Interpreter::dcbz_l(UGeckoInstruction inst) return; } - // FAKE: clear memory instead of clearing the cache block - PowerPC::ClearCacheLine(address & (~31)); + PowerPC::ClearDCacheLine(address & (~31)); } // eciwx/ecowx technically should access the specified device diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp index 8699f05b75..09a96aafeb 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp @@ -250,9 +250,33 @@ void Interpreter::mfspr(UGeckoInstruction inst) rSPR(index) &= ~1; } break; + case SPR_XER: rSPR(index) = PowerPC::GetXER().Hex; break; + + case SPR_UPMC1: + rSPR(index) = rSPR(SPR_PMC1); + break; + + case SPR_UPMC2: + rSPR(index) = rSPR(SPR_PMC2); + break; + + case SPR_UPMC3: + rSPR(index) = rSPR(SPR_PMC3); + break; + + case SPR_UPMC4: + rSPR(index) = rSPR(SPR_PMC4); + break; + + case SPR_IABR: + // A strange quirk: reading back this register on hardware will always have the TE (Translation + // enabled) bit set to 0 (despite the bit appearing to function normally when set). This does + // not apply to the DABR.
+ rGPR[inst.RD] = rSPR(index) & ~1; + return; } rGPR[inst.RD] = rSPR(index); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index 046a0d6d94..fe95271136 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -229,6 +229,8 @@ void Jit64::lXXx(UGeckoInstruction inst) void Jit64::dcbx(UGeckoInstruction inst) { + FALLBACK_IF(m_accurate_cpu_cache_enabled); + INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); @@ -444,7 +446,7 @@ void Jit64::dcbz(UGeckoInstruction inst) MOV(32, PPCSTATE(pc), Imm32(js.compilerPC)); BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); - ABI_CallFunctionR(PowerPC::ClearCacheLine, RSCRATCH); + ABI_CallFunctionR(PowerPC::ClearDCacheLine, RSCRATCH); ABI_PopRegistersAndAdjustStack(registersInUse, 0); if (emit_fast_path) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp index 912bb6e603..197fce6945 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp @@ -412,6 +412,11 @@ void Jit64::mfspr(UGeckoInstruction inst) case SPR_PMC2: case SPR_PMC3: case SPR_PMC4: + case SPR_UPMC1: + case SPR_UPMC2: + case SPR_UPMC3: + case SPR_UPMC4: + case SPR_IABR: FALLBACK_IF(true); default: { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 732094e3c1..e32d166be2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -61,6 +61,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, { const u32 access_size = BackPatchInfo::GetFlagSize(flags); + if (m_accurate_cpu_cache_enabled) + mode = MemAccessMode::AlwaysSafe; + const bool emit_fastmem = mode
!= MemAccessMode::AlwaysSafe; const bool emit_slowmem = mode != MemAccessMode::AlwaysUnsafe; @@ -228,7 +231,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, } else if (flags & BackPatchInfo::FLAG_ZERO_256) { - MOVP2R(ARM64Reg::X8, &PowerPC::ClearCacheLine); + MOVP2R(ARM64Reg::X8, &PowerPC::ClearDCacheLine); BLR(ARM64Reg::X8); } else diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index f00a700faf..64ba331b17 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -635,6 +635,8 @@ void JitArm64::stmw(UGeckoInstruction inst) void JitArm64::dcbx(UGeckoInstruction inst) { + FALLBACK_IF(m_accurate_cpu_cache_enabled); + INSTRUCTION_START JITDISABLE(bJITLoadStoreOff); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp index cc95654b76..6a626f5aef 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp @@ -395,6 +395,15 @@ void JitArm64::mfspr(UGeckoInstruction inst) break; case SPR_WPAR: case SPR_DEC: + case SPR_PMC1: + case SPR_PMC2: + case SPR_PMC3: + case SPR_PMC4: + case SPR_UPMC1: + case SPR_UPMC2: + case SPR_UPMC3: + case SPR_UPMC4: + case SPR_IABR: FALLBACK_IF(true); default: gpr.BindToRegister(d, false); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp index 679fdea0a8..27bdf3bb13 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.cpp @@ -58,6 +58,13 @@ void JitBase::RefreshConfig() m_fastmem_enabled = Config::Get(Config::MAIN_FASTMEM); m_mmu_enabled = Core::System::GetInstance().IsMMUMode(); m_pause_on_panic_enabled = Core::System::GetInstance().IsPauseOnPanicMode(); +
m_accurate_cpu_cache_enabled = Config::Get(Config::MAIN_ACCURATE_CPU_CACHE); + if (m_accurate_cpu_cache_enabled) + { + m_fastmem_enabled = false; + // The low-MEM1 dcbz hack is unneeded if the data cache is being emulated. + m_low_dcbz_hack = false; + } analyzer.SetDebuggingEnabled(m_enable_debugging); analyzer.SetBranchFollowingEnabled(Config::Get(Config::MAIN_JIT_FOLLOW_BRANCH)); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBase.h b/Source/Core/Core/PowerPC/JitCommon/JitBase.h index 99c4d67485..ad218ed8a3 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBase.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitBase.h @@ -136,6 +136,7 @@ protected: bool m_fastmem_enabled = false; bool m_mmu_enabled = false; bool m_pause_on_panic_enabled = false; + bool m_accurate_cpu_cache_enabled = false; void RefreshConfig(); diff --git a/Source/Core/Core/PowerPC/MMU.cpp b/Source/Core/Core/PowerPC/MMU.cpp index 150b6673ec..d620375823 100644 --- a/Source/Core/Core/PowerPC/MMU.cpp +++ b/Source/Core/Core/PowerPC/MMU.cpp @@ -189,6 +189,8 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) return static_cast(var); } + bool wi = false; + if (!never_translate && MSR.DR) { auto translated_addr = TranslateAddress(em_address); @@ -199,6 +201,7 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) return 0; } em_address = translated_addr.address; + wi = translated_addr.wi; } if (flag == XCheckTLBFlag::Read && (em_address & 0xF8000000) == 0x08000000) @@ -223,7 +226,18 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) // Handle RAM; the masking intentionally discards bits (essentially creating // mirrors of memory).
T value; - std::memcpy(&value, &memory.GetRAM()[em_address & memory.GetRamMask()], sizeof(T)); + em_address &= memory.GetRamMask(); + + if (!ppcState.m_enable_dcache || wi) + { + std::memcpy(&value, &memory.GetRAM()[em_address], sizeof(T)); + } + else + { + ppcState.dCache.Read(em_address, &value, sizeof(T), + HID0.DLOCK || flag != XCheckTLBFlag::Read); + } + return bswap(value); } @@ -231,7 +245,18 @@ static T ReadFromHardware(Memory::MemoryManager& memory, u32 em_address) (em_address & 0x0FFFFFFF) < memory.GetExRamSizeReal()) { T value; - std::memcpy(&value, &memory.GetEXRAM()[em_address & 0x0FFFFFFF], sizeof(T)); + em_address &= 0x0FFFFFFF; + + if (!ppcState.m_enable_dcache || wi) + { + std::memcpy(&value, &memory.GetEXRAM()[em_address], sizeof(T)); + } + else + { + ppcState.dCache.Read(em_address + 0x10000000, &value, sizeof(T), + HID0.DLOCK || flag != XCheckTLBFlag::Read); + } + return bswap(value); } @@ -396,14 +421,28 @@ static void WriteToHardware(Core::System& system, Memory::MemoryManager& memory, { // Handle RAM; the masking intentionally discards bits (essentially creating // mirrors of memory).
- std::memcpy(&memory.GetRAM()[em_address & memory.GetRamMask()], &swapped_data, size); + em_address &= memory.GetRamMask(); + + if (ppcState.m_enable_dcache && !wi) + ppcState.dCache.Write(em_address, &swapped_data, size, HID0.DLOCK); + + if (!ppcState.m_enable_dcache || wi || flag != XCheckTLBFlag::Write) + std::memcpy(&memory.GetRAM()[em_address], &swapped_data, size); + return; } if (memory.GetEXRAM() && (em_address >> 28) == 0x1 && (em_address & 0x0FFFFFFF) < memory.GetExRamSizeReal()) { - std::memcpy(&memory.GetEXRAM()[em_address & 0x0FFFFFFF], &swapped_data, size); + em_address &= 0x0FFFFFFF; + + if (ppcState.m_enable_dcache && !wi) + ppcState.dCache.Write(em_address + 0x10000000, &swapped_data, size, HID0.DLOCK); + + if (!ppcState.m_enable_dcache || wi || flag != XCheckTLBFlag::Write) + std::memcpy(&memory.GetEXRAM()[em_address], &swapped_data, size); + return; } @@ -1105,7 +1144,7 @@ void DMA_MemoryToLC(const u32 cache_address, const u32 mem_address, const u32 nu memcpy(dst, src, 32 * num_blocks); } -void ClearCacheLine(u32 address) +void ClearDCacheLine(u32 address) { DEBUG_ASSERT((address & 0x1F) == 0); if (MSR.DR) @@ -1136,6 +1175,100 @@ void ClearCacheLine(u32 address) WriteToHardware(system, memory, address + i, 0, 4); } +void StoreDCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI.
+ GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Store(address); +} + +void InvalidateDCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Invalidate(address); +} + +void FlushDCacheLine(u32 address) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI. + GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Flush(address); +} + +void TouchDCacheLine(u32 address, bool store) +{ + address &= ~0x1F; + + if (MSR.DR) + { + auto translated_address = TranslateAddress(address); + if (translated_address.result == TranslateAddressResultEnum::DIRECT_STORE_SEGMENT) + { + return; + } + if (translated_address.result == TranslateAddressResultEnum::PAGE_FAULT) + { + // If translation fails, generate a DSI.
+ GenerateDSIException(address, true); + return; + } + address = translated_address.address; + } + + if (ppcState.m_enable_dcache) + ppcState.dCache.Touch(address, store); +} + u32 IsOptimizableMMIOAccess(u32 address, u32 access_size) { if (PowerPC::memchecks.HasAny()) diff --git a/Source/Core/Core/PowerPC/MMU.h b/Source/Core/Core/PowerPC/MMU.h index 6eda9a22b7..44b9785611 100644 --- a/Source/Core/Core/PowerPC/MMU.h +++ b/Source/Core/Core/PowerPC/MMU.h @@ -164,7 +164,12 @@ void Write_F64(double var, u32 address); void DMA_LCToMemory(u32 mem_address, u32 cache_address, u32 num_blocks); void DMA_MemoryToLC(u32 cache_address, u32 mem_address, u32 num_blocks); -void ClearCacheLine(u32 address); // Zeroes 32 bytes; address should be 32-byte-aligned + +void ClearDCacheLine(u32 address); // Zeroes 32 bytes; address should be 32-byte-aligned +void StoreDCacheLine(u32 address); // Writes the line back to memory if it is modified +void InvalidateDCacheLine(u32 address); // Drops the line without writing it back +void FlushDCacheLine(u32 address); // Writes the line back if modified, then drops it +void TouchDCacheLine(u32 address, bool store); // Loads the line into the data cache // TLB functions void SDRUpdated(); diff --git a/Source/Core/Core/PowerPC/PPCCache.cpp b/Source/Core/Core/PowerPC/PPCCache.cpp index 89b85a60db..d56d77927c 100644 --- a/Source/Core/Core/PowerPC/PPCCache.cpp +++ b/Source/Core/Core/PowerPC/PPCCache.cpp @@ -94,48 +94,299 @@ InstructionCache::~InstructionCache() Config::RemoveConfigChangedCallback(*m_config_callback_id); } -void InstructionCache::Reset() +void Cache::Reset() { valid.fill(0); plru.fill(0); + modified.fill(0); lookup_table.fill(0xFF); lookup_table_ex.fill(0xFF); lookup_table_vmem.fill(0xFF); +} + +void InstructionCache::Reset() +{ + Cache::Reset(); JitInterface::ClearSafe(); } +void Cache::Init() +{ + data.fill({}); + addrs.fill({}); + Reset(); +} + void InstructionCache::Init() { if (!m_config_callback_id) m_config_callback_id = Config::AddConfigChangedCallback([this] { RefreshConfig(); }); RefreshConfig(); - data.fill({}); - tags.fill({}); + Cache::Init(); +} + +void Cache::Store(u32 addr) +{ + auto& system =
Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto [set, way] = GetCache(addr, true); + + if (way == 0xff) + return; + + if (valid[set] & (1U << way) && modified[set] & (1U << way)) + memory.CopyToEmu((addr & ~0x1f), reinterpret_cast(data[set][way].data()), 32); + modified[set] &= ~(1U << way); +} + +void Cache::FlushAll() +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + for (size_t set = 0; set < CACHE_SETS; set++) + { + for (size_t way = 0; way < CACHE_WAYS; way++) + { + if (valid[set] & (1U << way) && modified[set] & (1U << way)) + memory.CopyToEmu(addrs[set][way], reinterpret_cast(data[set][way].data()), 32); + } + } + Reset(); } -void InstructionCache::Invalidate(u32 addr) +void Cache::Invalidate(u32 addr) { - if (!HID0.ICE || m_disable_icache) + auto [set, way] = GetCache(addr, true); + + if (way == 0xff) return; - // Invalidates the whole set - const u32 set = (addr >> 5) & 0x7f; - for (size_t i = 0; i < 8; i++) + if (valid[set] & (1U << way)) { - if (valid[set] & (1U << i)) + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] = 0xff; + else + lookup_table[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + + valid[set] &= ~(1U << way); + modified[set] &= ~(1U << way); + } +} + +void Cache::Flush(u32 addr) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto [set, way] = GetCache(addr, true); + + if (way == 0xff) + return; + + if (valid[set] & (1U << way)) + { + if (modified[set] & (1U << way)) + memory.CopyToEmu((addr & ~0x1f), reinterpret_cast(data[set][way].data()), 32); + + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] = 0xff; + else +
lookup_table[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + + valid[set] &= ~(1U << way); + modified[set] &= ~(1U << way); + } +} + +void Cache::Touch(u32 addr, bool store) +{ + GetCache(addr, false); // Loads the line if it is absent; the store hint is currently unused +} + +std::pair Cache::GetCache(u32 addr, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + addr &= ~31; + u32 set = (addr >> 5) & 0x7f; + u32 way; + + if (addr & CACHE_VMEM_BIT) + { + way = lookup_table_vmem[(addr >> 5) & 0xfffff]; + } + else if (addr & CACHE_EXRAM_BIT) + { + way = lookup_table_ex[(addr >> 5) & 0x1fffff]; + } + else + { + way = lookup_table[(addr >> 5) & 0xfffff]; + } + + // load to the cache + if (!locked && way == 0xff) + { + // select a way + if (valid[set] != 0xff) + way = s_way_from_valid[valid[set]]; + else + way = s_way_from_plru[plru[set]]; + + if (valid[set] & (1 << way)) { - if (tags[set][i] & (ICACHE_VMEM_BIT >> 12)) - lookup_table_vmem[((tags[set][i] << 7) | set) & 0xfffff] = 0xff; - else if (tags[set][i] & (ICACHE_EXRAM_BIT >> 12)) - lookup_table_ex[((tags[set][i] << 7) | set) & 0x1fffff] = 0xff; + // store the cache back to main memory + if (modified[set] & (1 << way)) + memory.CopyToEmu(addrs[set][way], reinterpret_cast(data[set][way].data()), 32); + + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] = 0xff; else - lookup_table[((tags[set][i] << 7) | set) & 0xfffff] = 0xff; + lookup_table[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + } + + // load + memory.CopyFromEmu(reinterpret_cast(data[set][way].data()), (addr & ~0x1f), 32); + + if (addr & CACHE_VMEM_BIT) + lookup_table_vmem[(addr >> 5) & 0xfffff] = way; + else if (addr & CACHE_EXRAM_BIT) + lookup_table_ex[(addr >> 5) & 0x1fffff] = way; + else + lookup_table[(addr >> 5) & 0xfffff] = way; + + addrs[set][way] = addr; + valid[set] |= (1 << way); + modified[set] &= ~(1 << way); + } + + // update
plru on every access that resolved to a way (hits and fills alike) + if (way != 0xff) + plru[set] = (plru[set] & ~s_plru_mask[way]) | s_plru_value[way]; + + return {set, way}; +} + +void Cache::Read(u32 addr, void* buffer, u32 len, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto* value = static_cast(buffer); + + while (len > 0) + { + auto [set, way] = GetCache(addr, locked); + + u32 offset_in_block = addr - (addr & ~31); + u32 len_in_block = std::min(len, ((addr + 32) & ~31) - addr); + + if (way != 0xff) + { + std::memcpy(value, reinterpret_cast(data[set][way].data()) + offset_in_block, + len_in_block); + } + else + { + memory.CopyFromEmu(value, addr, len_in_block); + } + + addr += len_in_block; + len -= len_in_block; + value += len_in_block; + } +} + +void Cache::Write(u32 addr, const void* buffer, u32 len, bool locked) +{ + auto& system = Core::System::GetInstance(); + auto& memory = system.GetMemory(); + + auto* value = static_cast(buffer); + + while (len > 0) + { + auto [set, way] = GetCache(addr, locked); + + u32 offset_in_block = addr - (addr & ~31); + u32 len_in_block = std::min(len, ((addr + 32) & ~31) - addr); + + if (way != 0xff) + { + std::memcpy(reinterpret_cast(data[set][way].data()) + offset_in_block, value, + len_in_block); + modified[set] |= (1 << way); + } + else + { + memory.CopyToEmu(addr, value, len_in_block); + } + + addr += len_in_block; + len -= len_in_block; + value += len_in_block; + } +} + +void Cache::DoState(PointerWrap& p) +{ + if (p.IsReadMode()) + { + // Clear valid parts of the lookup tables (this is done instead of using fill(0xff) to avoid + // loading the entire 4MB of tables into cache) + for (u32 set = 0; set < CACHE_SETS; set++) + { + for (u32 way = 0; way < CACHE_WAYS; way++) + { + if ((valid[set] & (1 << way)) != 0) + { + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] =
0xff; + else + lookup_table[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + } + } + } + } + + p.DoArray(data); + p.DoArray(plru); + p.DoArray(valid); + p.DoArray(addrs); + p.DoArray(modified); + + if (p.IsReadMode()) + { + // Recompute lookup tables so that every restored valid line maps back to its way + for (u32 set = 0; set < CACHE_SETS; set++) + { + for (u32 way = 0; way < CACHE_WAYS; way++) + { + if ((valid[set] & (1 << way)) != 0) + { + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = way; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] = way; + else + lookup_table[(addrs[set][way] >> 5) & 0xfffff] = way; + } + } } } - valid[set] = 0; - JitInterface::InvalidateICacheLine(addr); } u32 InstructionCache::ReadInstruction(u32 addr) @@ -145,116 +396,35 @@ u32 InstructionCache::ReadInstruction(u32 addr) if (!HID0.ICE || m_disable_icache) // instruction cache is disabled return memory.Read_U32(addr); - u32 set = (addr >> 5) & 0x7f; - u32 tag = addr >> 12; - u32 t; - if (addr & ICACHE_VMEM_BIT) - { - t = lookup_table_vmem[(addr >> 5) & 0xfffff]; - } - else if (addr & ICACHE_EXRAM_BIT) - { - t = lookup_table_ex[(addr >> 5) & 0x1fffff]; - } - else - { - t = lookup_table[(addr >> 5) & 0xfffff]; - } - - if (t == 0xff) // load to the cache - { - if (HID0.ILOCK) // instruction cache is locked - return memory.Read_U32(addr); - // select a way - if (valid[set] != 0xff) - t = s_way_from_valid[valid[set]]; - else - t = s_way_from_plru[plru[set]]; - // load - memory.CopyFromEmu(reinterpret_cast(data[set][t].data()), (addr & ~0x1f), 32); - if (valid[set] & (1 << t)) - { - if (tags[set][t] & (ICACHE_VMEM_BIT >> 12)) - lookup_table_vmem[((tags[set][t] << 7) | set) & 0xfffff] = 0xff; - else if (tags[set][t] & (ICACHE_EXRAM_BIT >> 12)) - lookup_table_ex[((tags[set][t] << 7) | set) & 0x1fffff] = 0xff; - else - lookup_table[((tags[set][t] << 7) | set) & 0xfffff] = 0xff; - } - - if (addr & ICACHE_VMEM_BIT) - lookup_table_vmem[(addr >> 5) &
0xfffff] = t; - else if (addr & ICACHE_EXRAM_BIT) - lookup_table_ex[(addr >> 5) & 0x1fffff] = t; - else - lookup_table[(addr >> 5) & 0xfffff] = t; - tags[set][t] = tag; - valid[set] |= (1 << t); - } - // update plru - plru[set] = (plru[set] & ~s_plru_mask[t]) | s_plru_value[t]; - const u32 res = Common::swap32(data[set][t][(addr >> 2) & 7]); - const u32 inmem = memory.Read_U32(addr); - if (res != inmem) - { - INFO_LOG_FMT(POWERPC, - "ICache read at {:08x} returned stale data: CACHED: {:08x} vs. RAM: {:08x}", addr, - res, inmem); - DolphinAnalytics::Instance().ReportGameQuirk(GameQuirk::ICACHE_MATTERS); - } - return res; + u32 value; + Read(addr, &value, sizeof(value), HID0.ILOCK); + return Common::swap32(value); } -void InstructionCache::DoState(PointerWrap& p) +void InstructionCache::Invalidate(u32 addr) { - if (p.IsReadMode()) + if (!HID0.ICE || m_disable_icache) + return; + + // Invalidates the whole set + const u32 set = (addr >> 5) & 0x7f; + for (size_t way = 0; way < 8; way++) { - // Clear valid parts of the lookup tables (this is done instead of using fill(0xff) to avoid - // loading the entire 4MB of tables into cache) - for (u32 set = 0; set < ICACHE_SETS; set++) + if (valid[set] & (1U << way)) { - for (u32 way = 0; way < ICACHE_WAYS; way++) - { - if ((valid[set] & (1 << way)) != 0) - { - const u32 addr = (tags[set][way] << 12) | (set << 5); - if (addr & ICACHE_VMEM_BIT) - lookup_table_vmem[(addr >> 5) & 0xfffff] = 0xff; - else if (addr & ICACHE_EXRAM_BIT) - lookup_table_ex[(addr >> 5) & 0x1fffff] = 0xff; - else - lookup_table[(addr >> 5) & 0xfffff] = 0xff; - } - } + if (addrs[set][way] & CACHE_VMEM_BIT) + lookup_table_vmem[(addrs[set][way] >> 5) & 0xfffff] = 0xff; + else if (addrs[set][way] & CACHE_EXRAM_BIT) + lookup_table_ex[(addrs[set][way] >> 5) & 0x1fffff] = 0xff; + else + lookup_table[(addrs[set][way] >> 5) & 0xfffff] = 0xff; } } + valid[set] = 0; + modified[set] = 0; - p.DoArray(data); - p.DoArray(tags); - p.DoArray(plru); - p.DoArray(valid); - -
if (p.IsReadMode()) - { - // Recompute lookup tables - for (u32 set = 0; set < ICACHE_SETS; set++) - { - for (u32 way = 0; way < ICACHE_WAYS; way++) - { - if ((valid[set] & (1 << way)) != 0) - { - const u32 addr = (tags[set][way] << 12) | (set << 5); - if (addr & ICACHE_VMEM_BIT) - lookup_table_vmem[(addr >> 5) & 0xfffff] = way; - else if (addr & ICACHE_EXRAM_BIT) - lookup_table_ex[(addr >> 5) & 0x1fffff] = way; - else - lookup_table[(addr >> 5) & 0xfffff] = way; - } - } - } - } + JitInterface::InvalidateICacheLine(addr); } void InstructionCache::RefreshConfig() diff --git a/Source/Core/Core/PowerPC/PPCCache.h b/Source/Core/Core/PowerPC/PPCCache.h index 4b9906ea42..49843c9b74 100644 --- a/Source/Core/Core/PowerPC/PPCCache.h +++ b/Source/Core/Core/PowerPC/PPCCache.h @@ -12,20 +12,27 @@ class PointerWrap; namespace PowerPC { -constexpr u32 ICACHE_SETS = 128; -constexpr u32 ICACHE_WAYS = 8; +constexpr u32 CACHE_SETS = 128; +constexpr u32 CACHE_WAYS = 8; // size of an instruction cache block in words -constexpr u32 ICACHE_BLOCK_SIZE = 8; +constexpr u32 CACHE_BLOCK_SIZE = 8; -constexpr u32 ICACHE_EXRAM_BIT = 0x10000000; -constexpr u32 ICACHE_VMEM_BIT = 0x20000000; +constexpr u32 CACHE_EXRAM_BIT = 0x10000000; +constexpr u32 CACHE_VMEM_BIT = 0x20000000; -struct InstructionCache +struct Cache { - std::array, ICACHE_WAYS>, ICACHE_SETS> data{}; - std::array, ICACHE_SETS> tags{}; - std::array plru{}; - std::array valid{}; + std::array, CACHE_WAYS>, CACHE_SETS> data{}; + + // Stores the 32-byte aligned address of the start of each cache block. This consists of the cache + // set and tag. Real hardware only needs to store the tag, but also including the set simplifies + // debugging and getting the actual address in the cache, without changing behavior (as the set + // portion of the address is by definition the same for all addresses in a set).
+ std::array, CACHE_SETS> addrs{}; + + std::array plru{}; + std::array valid{}; + std::array modified{}; // Note: This is only for performance purposes; this same data could be computed at runtime // from the tags and valid fields (and that's how it's done on the actual cache) @@ -33,16 +40,36 @@ struct InstructionCache std::array lookup_table_ex{}; std::array lookup_table_vmem{}; - bool m_disable_icache = false; + void Store(u32 addr); + void Invalidate(u32 addr); + void Flush(u32 addr); + void Touch(u32 addr, bool store); + + void FlushAll(); + + std::pair GetCache(u32 addr, bool locked); + + void Read(u32 addr, void* buffer, u32 len, bool locked); + void Write(u32 addr, const void* buffer, u32 len, bool locked); + + void Init(); + void Reset(); + + void DoState(PointerWrap& p); +}; + +struct InstructionCache : public Cache +{ std::optional m_config_callback_id = std::nullopt; + bool m_disable_icache = false; + InstructionCache() = default; ~InstructionCache(); u32 ReadInstruction(u32 addr); void Invalidate(u32 addr); void Init(); void Reset(); - void DoState(PointerWrap& p); void RefreshConfig(); }; } // namespace PowerPC diff --git a/Source/Core/Core/PowerPC/PowerPC.cpp b/Source/Core/Core/PowerPC/PowerPC.cpp index eafa547e92..411705d833 100644 --- a/Source/Core/Core/PowerPC/PowerPC.cpp +++ b/Source/Core/Core/PowerPC/PowerPC.cpp @@ -132,9 +132,18 @@ void DoState(PointerWrap& p) p.Do(ppcState.reserve_address); ppcState.iCache.DoState(p); + ppcState.dCache.DoState(p); if (p.IsReadMode()) { + // A savestate made with data cache emulation on may hold modified lines. If emulation is off + // now, write them back to memory; otherwise keep the restored cache contents as they are. + if (!ppcState.m_enable_dcache) + { + INFO_LOG_FMT(POWERPC, "Flushing data cache"); + ppcState.dCache.FlushAll(); + } + RoundingModeUpdated(); IBATUpdated(); DBATUpdated(); @@ -266,6 +275,9 @@ void Init(CPUCore cpu_core) InitializeCPUCore(cpu_core); ppcState.iCache.Init(); + ppcState.dCache.Init(); + + ppcState.m_enable_dcache = Config::Get(Config::MAIN_ACCURATE_CPU_CACHE); if (Config::Get(Config::MAIN_ENABLE_DEBUGGING))
breakpoints.ClearAllTemporary(); @@ -279,6 +291,7 @@ void Reset() ResetRegisters(); ppcState.iCache.Reset(); + ppcState.dCache.Reset(); } void ScheduleInvalidateCacheThreadSafe(u32 address) diff --git a/Source/Core/Core/PowerPC/PowerPC.h b/Source/Core/Core/PowerPC/PowerPC.h index df60432fce..4d70ab7439 100644 --- a/Source/Core/Core/PowerPC/PowerPC.h +++ b/Source/Core/Core/PowerPC/PowerPC.h @@ -172,6 +172,8 @@ struct PowerPCState u32 pagetable_hashmask = 0; InstructionCache iCache; + bool m_enable_dcache = false; + Cache dCache; // Reservation monitor for lwarx and its friend stwcxd. bool reserve; diff --git a/Source/Core/Core/State.cpp b/Source/Core/Core/State.cpp index f5e494e6d1..d0cb1d0d3b 100644 --- a/Source/Core/Core/State.cpp +++ b/Source/Core/Core/State.cpp @@ -95,7 +95,7 @@ static size_t s_state_writes_in_queue; static std::condition_variable s_state_write_queue_is_empty; // Don't forget to increase this after doing changes on the savestate system -constexpr u32 STATE_VERSION = 156; // Last changed in PR 11184 +constexpr u32 STATE_VERSION = 157; // Last changed in PR 11183 // Maps savestate versions to Dolphin versions. // Versions after 42 don't need to be added to this list, @@ -223,14 +223,18 @@ static void DoState(PointerWrap& p) g_video_backend->DoState(p); p.DoMarker("video_backend"); - PowerPC::DoState(p); - p.DoMarker("PowerPC"); // CoreTiming needs to be restored before restoring Hardware because // the controller code might need to schedule an event if the controller has changed. system.GetCoreTiming().DoState(p); p.DoMarker("CoreTiming"); + + // HW needs to be restored before PowerPC because the data cache might need to be flushed.
HW::DoState(p); p.DoMarker("HW"); + + PowerPC::DoState(p); + p.DoMarker("PowerPC"); + if (SConfig::GetInstance().bWii) Wiimote::DoState(p); p.DoMarker("Wiimote"); diff --git a/Source/Core/DolphinQt/Settings/AdvancedPane.cpp b/Source/Core/DolphinQt/Settings/AdvancedPane.cpp index f08f9d1570..62e234219d 100644 --- a/Source/Core/DolphinQt/Settings/AdvancedPane.cpp +++ b/Source/Core/DolphinQt/Settings/AdvancedPane.cpp @@ -74,6 +74,12 @@ void AdvancedPane::CreateLayout() "affect performance.\nThe performance impact is the same as having Enable MMU on.")); cpu_options_group_layout->addWidget(m_pause_on_panic_checkbox); + m_accurate_cpu_cache_checkbox = new QCheckBox(tr("Enable Write-Back Cache (slow)")); + m_accurate_cpu_cache_checkbox->setToolTip( + tr("Enables emulation of the CPU write-back cache.\nEnabling will have a significant impact " + "on performance.\nThis should be left disabled unless absolutely needed.")); + cpu_options_group_layout->addWidget(m_accurate_cpu_cache_checkbox); + auto* clock_override = new QGroupBox(tr("Clock Override")); auto* clock_override_layout = new QVBoxLayout(); clock_override->setLayout(clock_override_layout); @@ -189,6 +195,9 @@ void AdvancedPane::ConnectLayout() connect(m_pause_on_panic_checkbox, &QCheckBox::toggled, this, [](bool checked) { Config::SetBaseOrCurrent(Config::MAIN_PAUSE_ON_PANIC, checked); }); + connect(m_accurate_cpu_cache_checkbox, &QCheckBox::toggled, this, + [](bool checked) { Config::SetBaseOrCurrent(Config::MAIN_ACCURATE_CPU_CACHE, checked); }); + m_cpu_clock_override_checkbox->setChecked(Config::Get(Config::MAIN_OVERCLOCK_ENABLE)); connect(m_cpu_clock_override_checkbox, &QCheckBox::toggled, [this](bool enable_clock_override) { Config::SetBaseOrCurrent(Config::MAIN_OVERCLOCK_ENABLE, enable_clock_override); @@ -258,6 +267,9 @@ void AdvancedPane::Update() m_pause_on_panic_checkbox->setChecked(Config::Get(Config::MAIN_PAUSE_ON_PANIC)); m_pause_on_panic_checkbox->setEnabled(!running); + 
m_accurate_cpu_cache_checkbox->setChecked(Config::Get(Config::MAIN_ACCURATE_CPU_CACHE)); + m_accurate_cpu_cache_checkbox->setEnabled(!running); + QFont bf = font(); bf.setBold(Config::GetActiveLayerForConfig(Config::MAIN_OVERCLOCK_ENABLE) != Config::LayerType::Base); diff --git a/Source/Core/DolphinQt/Settings/AdvancedPane.h b/Source/Core/DolphinQt/Settings/AdvancedPane.h index c74aeacf09..b4fdb141cd 100644 --- a/Source/Core/DolphinQt/Settings/AdvancedPane.h +++ b/Source/Core/DolphinQt/Settings/AdvancedPane.h @@ -33,6 +33,7 @@ private: QComboBox* m_cpu_emulation_engine_combobox; QCheckBox* m_enable_mmu_checkbox; QCheckBox* m_pause_on_panic_checkbox; + QCheckBox* m_accurate_cpu_cache_checkbox; QCheckBox* m_cpu_clock_override_checkbox; QSlider* m_cpu_clock_override_slider; QLabel* m_cpu_clock_override_slider_label;