SPU/PPU: Implement Atomic Cache Line Stores
commit 09cddc84be
parent 9baef8c705
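Overview: when the new "Accurate Cache Line Stores" option is enabled, every full 128-byte (cache line) store is routed through a single helper, do_cell_atomic_128_store() — the PPU interpreter and LLVM translator use it for DCBZ, and SPU DMA PUT uses it for aligned cache-line transfers. A minimal sketch of the call pattern, assembled from the hunks below (addr0, eal and src are the names used in the diff; the snippet itself is illustrative and not part of the commit):

    // Implemented on the SPU side, declared extern for the PPU interpreter/translator.
    extern void do_cell_atomic_128_store(u32 addr, const void* to_write);

    // PPU DCBZ: zero an entire 128-byte cache line atomically (addr0 = address & ~127).
    alignas(64) static constexpr u8 zero_buf[128]{};
    do_cell_atomic_128_store(addr0, zero_buf);

    // SPU PUT of a full, aligned cache line: "as atomic as PUTLLUC".
    do_cell_atomic_128_store(eal, src);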
@@ -25,6 +25,8 @@
 
 const bool s_use_ssse3 = utils::has_ssse3();
 
+extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
+
 inline u64 dup32(u32 x) { return x | static_cast<u64>(x) << 32; }
 
 // Write values to CR field
@@ -4435,11 +4437,10 @@ bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op)
 	const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
 	const u32 addr0 = vm::cast(addr, HERE) & ~127;
 
-	if (g_cfg.core.spu_accurate_dma)
+	if (g_cfg.core.accurate_cache_line_stores)
 	{
-		auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb);
-		std::memset(vm::base(addr0), 0, 128);
-		res.release(rtime + 128);
+		alignas(64) static constexpr u8 zero_buf[128]{};
+		do_cell_atomic_128_store(addr0, zero_buf);
 		return true;
 	}
 
@@ -92,6 +92,7 @@ void fmt_class_string<ppu_join_status>::format(std::string& out, u64 arg)
 
 constexpr ppu_decoder<ppu_interpreter_precise> g_ppu_interpreter_precise;
 constexpr ppu_decoder<ppu_interpreter_fast> g_ppu_interpreter_fast;
+constexpr ppu_decoder<ppu_itype> g_ppu_itype;
 
 extern void ppu_initialize();
 extern void ppu_initialize(const ppu_module& info);
@@ -99,6 +100,8 @@ static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_pa
 extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
 static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
 
+extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
+
 // Get pointer to executable cache
 template<typename T = u64>
 static T& ppu_ref(u32 addr)
@@ -1420,6 +1423,7 @@ extern void ppu_initialize(const ppu_module& info)
 		{ "__lvrx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_lvrx) : reinterpret_cast<u64>(sse_cellbe_lvrx_v0) },
 		{ "__stvlx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_stvlx) : reinterpret_cast<u64>(sse_cellbe_stvlx_v0) },
 		{ "__stvrx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_stvrx) : reinterpret_cast<u64>(sse_cellbe_stvrx_v0) },
+		{ "__dcbz", reinterpret_cast<u64>(+[](u32 addr){ alignas(64) static constexpr u8 z[128]{}; do_cell_atomic_128_store(addr, z); }) },
 		{ "__resupdate", reinterpret_cast<u64>(vm::reservation_update) },
 		{ "sys_config_io_event", reinterpret_cast<u64>(ppu_get_syscall(523)) },
 	};
@@ -1571,6 +1575,8 @@ extern void ppu_initialize(const ppu_module& info)
 	u8 output[20];
 	sha1_starts(&ctx);
 
+	int has_dcbz = !!g_cfg.core.accurate_cache_line_stores;
+
 	for (const auto& func : part.funcs)
 	{
 		if (func.size == 0)
@@ -1614,6 +1620,18 @@ extern void ppu_initialize(const ppu_module& info)
 				addr = roff + 4;
 			}
 
+			if (has_dcbz == 1)
+			{
+				for (u32 i = addr, end = block.second + block.first - 1; i <= end; i += 4)
+				{
+					if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
+					{
+						has_dcbz = 2;
+						break;
+					}
+				}
+			}
+
 			// Hash from addr to the end of the block
 			sha1_update(&ctx, vm::_ptr<const u8>(addr), block.second - (addr - block.first));
 		}
@@ -1623,6 +1641,18 @@ extern void ppu_initialize(const ppu_module& info)
 			continue;
 		}
 
+		if (has_dcbz == 1)
+		{
+			for (u32 i = func.addr, end = func.addr + func.size - 1; i <= end; i += 4)
+			{
+				if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
+				{
+					has_dcbz = 2;
+					break;
+				}
+			}
+		}
+
 		sha1_update(&ctx, vm::_ptr<const u8>(func.addr), func.size);
 	}
 
@@ -1641,6 +1671,7 @@ extern void ppu_initialize(const ppu_module& info)
 		accurate_fma,
 		accurate_ppu_vector_nan,
 		java_mode_handling,
+		accurate_cache_line_stores,
 
 		__bitset_enum_max
 	};
@@ -1662,6 +1693,10 @@ extern void ppu_initialize(const ppu_module& info)
 	{
 		settings += ppu_settings::java_mode_handling;
 	}
+	if (has_dcbz == 2)
+	{
+		settings += ppu_settings::accurate_cache_line_stores;
+	}
 
 	// Write version, hash, CPU, settings
 	fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
@@ -3502,8 +3502,16 @@ void PPUTranslator::ICBI(ppu_opcode_t op)
 
 void PPUTranslator::DCBZ(ppu_opcode_t op)
 {
-	const auto ptr = GetMemory(m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128), GetType<u8>());
-	Call(GetType<void>(), "llvm.memset.p0i8.i32", ptr, m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue());
+	const auto addr = m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128);
+
+	if (g_cfg.core.accurate_cache_line_stores)
+	{
+		Call(GetType<void>(), "__dcbz", addr);
+	}
+	else
+	{
+		Call(GetType<void>(), "llvm.memset.p0i8.i32", GetMemory(addr, GetType<u8>()), m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue());
+	}
 }
 
 void PPUTranslator::LWZ(ppu_opcode_t op)
@@ -229,6 +229,8 @@ static FORCE_INLINE rsx::thread* get_rsx_if_needs_res_pause(u32 addr)
 extern u64 get_timebased_time();
 extern u64 get_system_time();
 
+void do_cell_atomic_128_store(u32 addr, const void* to_write);
+
 extern thread_local u64 g_tls_fault_spu;
 
 namespace spu
@@ -606,7 +608,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
 	c.ret();
 });
 
-const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, spu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
+const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
 {
 	using namespace asmjit;
 
@@ -701,7 +703,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
 	//c.jmp(fall);
 
 	c.bind(fall);
-	c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state)), static_cast<u32>(cpu_flag::wait));
+	c.lock().bts(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::wait));
 
 	// Touch memory if transaction failed without RETRY flag on the first attempt
 	c.cmp(x86::r12, 1);
@@ -1474,6 +1476,19 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			{
 				size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
 
+				if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
+				{
+					// As atomic as PUTLLUC
+					do_cell_atomic_128_store(eal, src);
+
+					if (size == size0)
+					{
+						break;
+					}
+
+					continue;
+				}
+
 				// Lock each cache line exclusively
 				auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
 
@@ -1937,6 +1952,80 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
 	}
 }
 
+void do_cell_atomic_128_store(u32 addr, const void* to_write)
+{
+	using rdata_t = decltype(spu_thread::rdata);
+	const auto cpu = get_current_cpu_thread();
+
+	if (g_use_rtm) [[likely]]
+	{
+		const u32 result = spu_putlluc_tx(addr, to_write, cpu);
+
+		const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr;
+
+		if (render) render->pause();
+
+		if (result == 2)
+		{
+			cpu_thread::suspend_all cpu_lock(cpu);
+
+			if (vm::reservation_acquire(addr, 128) & 64)
+			{
+				// Wait for PUTLLC to complete
+				while (vm::reservation_acquire(addr, 128) & 63)
+				{
+					busy_wait(100);
+				}
+
+				mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
+				vm::reservation_acquire(addr, 128) += 64;
+			}
+		}
+		else if (result == 0)
+		{
+			cpu_thread::suspend_all cpu_lock(cpu);
+
+			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
+			{
+				busy_wait(100);
+			}
+
+			while (vm::reservation_acquire(addr, 128) & 63)
+			{
+				busy_wait(100);
+			}
+
+			mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
+			vm::reservation_acquire(addr, 128) += 64;
+		}
+
+		if (render) render->unpause();
+		static_cast<void>(cpu->test_stopped());
+	}
+	else
+	{
+		auto& data = vm::_ref<rdata_t>(addr);
+		auto [res, time0] = vm::reservation_lock(addr, 128);
+
+		*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
+
+		const auto render = get_rsx_if_needs_res_pause(addr);
+
+		if (render) render->pause();
+
+		auto& super_data = *vm::get_super_ptr<rdata_t>(addr);
+		{
+			// Full lock (heavyweight)
+			// TODO: vm::check_addr
+			vm::writer_lock lock(addr);
+			mov_rdata(super_data, *static_cast<const rdata_t*>(to_write));
+			res.release(time0 + 128);
+		}
+
+		if (render) render->unpause();
+	}
+}
+
 void spu_thread::do_putlluc(const spu_mfc_cmd& args)
 {
 	const u32 addr = args.eal & -128;
@@ -1955,77 +2044,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
 		// Failure, fallback to the main implementation
 	}
 
-	const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
-
-	// Store unconditionally
-	if (g_use_rtm) [[likely]]
-	{
-		const u32 result = spu_putlluc_tx(addr, to_write.data(), this);
-
-		const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr;
-
-		if (render) render->pause();
-
-		if (result == 2)
-		{
-			cpu_thread::suspend_all cpu_lock(this);
-
-			if (vm::reservation_acquire(addr, 128) & 64)
-			{
-				// Wait for PUTLLC to complete
-				while (vm::reservation_acquire(addr, 128) & 63)
-				{
-					busy_wait(100);
-				}
-
-				mov_rdata(vm::_ref<decltype(rdata)>(addr), to_write);
-				vm::reservation_acquire(addr, 128) += 64;
-			}
-		}
-		else if (result == 0)
-		{
-			cpu_thread::suspend_all cpu_lock(this);
-
-			while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
-			{
-				busy_wait(100);
-			}
-
-			while (vm::reservation_acquire(addr, 128) & 63)
-			{
-				busy_wait(100);
-			}
-
-			mov_rdata(vm::_ref<decltype(rdata)>(addr), to_write);
-			vm::reservation_acquire(addr, 128) += 64;
-		}
-
-		if (render) render->unpause();
-		static_cast<void>(test_stopped());
-	}
-	else
-	{
-		auto& data = vm::_ref<decltype(rdata)>(addr);
-		auto [res, time0] = vm::reservation_lock(addr, 128);
-
-		*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
-
-		const auto render = get_rsx_if_needs_res_pause(addr);
-
-		if (render) render->pause();
-
-		auto& super_data = *vm::get_super_ptr<decltype(rdata)>(addr);
-		{
-			// Full lock (heavyweight)
-			// TODO: vm::check_addr
-			vm::writer_lock lock(addr);
-			mov_rdata(super_data, to_write);
-			res.release(time0 + 128);
-		}
-
-		if (render) render->unpause();
-	}
-
+	do_cell_atomic_128_store(addr, _ptr<decltype(rdata)>(args.lsa & 0x3ff80));
 	vm::reservation_notifier(addr, 128).notify_all();
 }
 
@@ -44,6 +44,7 @@ struct cfg_root : cfg::node
 	cfg::_enum<spu_block_size_type> spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe };
 	cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false, true };
 	cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false };
+	cfg::_bool accurate_cache_line_stores{ this, "Accurate Cache Line Stores", false };
 	cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true};
 	cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled
 	cfg::_bool spu_cache{ this, "SPU Cache", true };
@@ -22,6 +22,7 @@ enum class emu_settings_type
 	EnableTSX,
 	AccurateGETLLAR,
 	AccurateSpuDMA,
+	AccurateClineStores,
 	AccurateLLVMdfma,
 	AccurateVectorNaN,
 	AccurateRSXAccess,
@@ -163,6 +164,7 @@ static const QMap<emu_settings_type, cfg_location> settings_location =
 	{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
 	{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
 	{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
+	{ emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}},
 	{ emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}},
 	{ emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}},
 	{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},
@@ -1735,6 +1735,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
 	m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA);
 	SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma);
 
+	m_emu_settings->EnhanceCheckBox(ui->accurateClineStores, emu_settings_type::AccurateClineStores);
+	SubscribeTooltip(ui->accurateClineStores, tooltips.settings.accurate_cache_line_stores);
+
 	m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess);
 	SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access);
 
@@ -3506,6 +3506,13 @@
        </property>
       </widget>
      </item>
+     <item>
+      <widget class="QCheckBox" name="accurateClineStores">
+       <property name="text">
+        <string>Accurate Cache Line Stores</string>
+       </property>
+      </widget>
+     </item>
      <item>
       <widget class="QCheckBox" name="hookStFunc">
       <property name="text">
@@ -78,6 +78,7 @@ public:
 	const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to developers.\nNever use this.");
 	const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation.");
 	const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations.");
+	const QString accurate_cache_line_stores = tr("Accurately processes PPU DCBZ instruction.\nIn addition, when combined with Accurate SPU DMA, SPU PUT cache line accesses will be processed atomically.");
 	const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA.");
 	const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)");
 	const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");