SPU/PPU: Implement Atomic Cache Line Stores

Eladash authored 2020-09-25 17:29:25 +03:00, committed by Ivan
parent 9baef8c705
commit 09cddc84be
9 changed files with 156 additions and 79 deletions

View File

@ -25,6 +25,8 @@
const bool s_use_ssse3 = utils::has_ssse3();
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
inline u64 dup32(u32 x) { return x | static_cast<u64>(x) << 32; }
// Write values to CR field
@ -4435,11 +4437,10 @@ bool ppu_interpreter::DCBZ(ppu_thread& ppu, ppu_opcode_t op)
const u64 addr = op.ra ? ppu.gpr[op.ra] + ppu.gpr[op.rb] : ppu.gpr[op.rb];
const u32 addr0 = vm::cast(addr, HERE) & ~127;
if (g_cfg.core.spu_accurate_dma)
if (g_cfg.core.accurate_cache_line_stores)
{
auto [res, rtime] = vm::reservation_lock(addr0, 128, vm::dma_lockb);
std::memset(vm::base(addr0), 0, 128);
res.release(rtime + 128);
alignas(64) static constexpr u8 zero_buf[128]{};
do_cell_atomic_128_store(addr0, zero_buf);
return true;
}
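
For context, DCBZ always clears one full 128-byte cache line regardless of the low bits of the effective address, which is why every path in this commit masks the address with ~127 (or -128) before handing it to the shared atomic store. A tiny standalone sketch of that semantics, using a toy guest buffer and a plain memcpy in place of the emulator's atomic helper:

```cpp
#include <cstdint>
#include <cstring>

// Toy guest memory; RPCS3 goes through vm::base() / do_cell_atomic_128_store instead.
static std::uint8_t g_mem[4096];

// Zero the 128-byte line containing 'ea' (sketch only; assumes ea < sizeof(g_mem)).
void dcbz_line(std::uint32_t ea)
{
    alignas(64) static constexpr std::uint8_t zero_buf[128]{};
    const std::uint32_t line = ea & ~127u; // align down to the cache-line base
    std::memcpy(g_mem + line, zero_buf, sizeof(zero_buf));
}
```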

View File

@ -92,6 +92,7 @@ void fmt_class_string<ppu_join_status>::format(std::string& out, u64 arg)
constexpr ppu_decoder<ppu_interpreter_precise> g_ppu_interpreter_precise;
constexpr ppu_decoder<ppu_interpreter_fast> g_ppu_interpreter_fast;
constexpr ppu_decoder<ppu_itype> g_ppu_itype;
extern void ppu_initialize();
extern void ppu_initialize(const ppu_module& info);
@ -99,6 +100,8 @@ static void ppu_initialize2(class jit_compiler& jit, const ppu_module& module_pa
extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
// Get pointer to executable cache
template<typename T = u64>
static T& ppu_ref(u32 addr)
@ -1420,6 +1423,7 @@ extern void ppu_initialize(const ppu_module& info)
{ "__lvrx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_lvrx) : reinterpret_cast<u64>(sse_cellbe_lvrx_v0) },
{ "__stvlx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_stvlx) : reinterpret_cast<u64>(sse_cellbe_stvlx_v0) },
{ "__stvrx", s_use_ssse3 ? reinterpret_cast<u64>(sse_cellbe_stvrx) : reinterpret_cast<u64>(sse_cellbe_stvrx_v0) },
{ "__dcbz", reinterpret_cast<u64>(+[](u32 addr){ alignas(64) static constexpr u8 z[128]{}; do_cell_atomic_128_store(addr, z); }) },
{ "__resupdate", reinterpret_cast<u64>(vm::reservation_update) },
{ "sys_config_io_event", reinterpret_cast<u64>(ppu_get_syscall(523)) },
};
@ -1571,6 +1575,8 @@ extern void ppu_initialize(const ppu_module& info)
u8 output[20];
sha1_starts(&ctx);
int has_dcbz = !!g_cfg.core.accurate_cache_line_stores;
for (const auto& func : part.funcs)
{
if (func.size == 0)
@ -1614,6 +1620,18 @@ extern void ppu_initialize(const ppu_module& info)
addr = roff + 4;
}
if (has_dcbz == 1)
{
for (u32 i = addr, end = block.second + block.first - 1; i <= end; i += 4)
{
if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
{
has_dcbz = 2;
break;
}
}
}
// Hash from addr to the end of the block
sha1_update(&ctx, vm::_ptr<const u8>(addr), block.second - (addr - block.first));
}
@ -1623,6 +1641,18 @@ extern void ppu_initialize(const ppu_module& info)
continue;
}
if (has_dcbz == 1)
{
for (u32 i = func.addr, end = func.addr + func.size - 1; i <= end; i += 4)
{
if (g_ppu_itype.decode(vm::read32(i)) == ppu_itype::DCBZ)
{
has_dcbz = 2;
break;
}
}
}
sha1_update(&ctx, vm::_ptr<const u8>(func.addr), func.size);
}
@ -1641,6 +1671,7 @@ extern void ppu_initialize(const ppu_module& info)
accurate_fma,
accurate_ppu_vector_nan,
java_mode_handling,
accurate_cache_line_stores,
__bitset_enum_max
};
@ -1662,6 +1693,10 @@ extern void ppu_initialize(const ppu_module& info)
{
settings += ppu_settings::java_mode_handling;
}
if (has_dcbz == 2)
{
settings += ppu_settings::accurate_cache_line_stores;
}
// Write version, hash, CPU, settings
fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
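
The two DCBZ scans above only promote has_dcbz from 1 to 2 when the opcode is actually present, so the accurate_cache_line_stores settings bit, and with it the cached object name built here, changes only for modules the option can affect. A rough standalone illustration of keying a cache name off a settings bitmask (simplified; the real code encodes the bitset with fmt::base57):

```cpp
#include <cstdint>
#include <string>

// Illustrative subset of the ppu_settings bitset from the hunk above.
enum class ppu_setting : unsigned
{
    accurate_fma,
    accurate_ppu_vector_nan,
    java_mode_handling,
    accurate_cache_line_stores,
};

// Fold the enabled settings into the cache file name, so toggling an option that
// can change the generated code also changes which cached object gets loaded.
std::string obj_name_for(const std::string& module_hash, bool module_has_dcbz, bool accurate_stores)
{
    std::uint32_t mask = 0;

    if (module_has_dcbz && accurate_stores)
    {
        mask |= 1u << static_cast<unsigned>(ppu_setting::accurate_cache_line_stores);
    }

    return "v3-" + module_hash + "-" + std::to_string(mask) + ".obj";
}
```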

View File

@ -3502,8 +3502,16 @@ void PPUTranslator::ICBI(ppu_opcode_t op)
void PPUTranslator::DCBZ(ppu_opcode_t op)
{
const auto ptr = GetMemory(m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128), GetType<u8>());
Call(GetType<void>(), "llvm.memset.p0i8.i32", ptr, m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue());
const auto addr = m_ir->CreateAnd(op.ra ? m_ir->CreateAdd(GetGpr(op.ra), GetGpr(op.rb)) : GetGpr(op.rb), -128);
if (g_cfg.core.accurate_cache_line_stores)
{
Call(GetType<void>(), "__dcbz", addr);
}
else
{
Call(GetType<void>(), "llvm.memset.p0i8.i32", GetMemory(addr, GetType<u8>()), m_ir->getInt8(0), m_ir->getInt32(128), m_ir->getTrue());
}
}
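
With the option enabled, the translator no longer inlines an llvm.memset; it emits a call to the "__dcbz" symbol, which ppu_initialize binds (in the function table shown earlier) to a small thunk around do_cell_atomic_128_store. A hedged sketch of that name-to-helper binding, with a stub store so it compiles on its own:

```cpp
#include <cstdint>
#include <map>
#include <string>

using u32 = std::uint32_t;
using u64 = std::uint64_t;
using u8  = std::uint8_t;

// Stub standing in for the emulator's atomic 128-byte store.
static void do_cell_atomic_128_store(u32 /*addr*/, const void* /*to_write*/) {}

// The JIT resolves the "__dcbz" symbol in generated code to this thunk's address,
// mirroring the function-table entry added in ppu_initialize.
static const std::map<std::string, u64> s_jit_helpers
{
    { "__dcbz", reinterpret_cast<u64>(+[](u32 addr)
        {
            alignas(64) static constexpr u8 zero_buf[128]{};
            do_cell_atomic_128_store(addr, zero_buf);
        }) },
};
```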
void PPUTranslator::LWZ(ppu_opcode_t op)

View File

@ -229,6 +229,8 @@ static FORCE_INLINE rsx::thread* get_rsx_if_needs_res_pause(u32 addr)
extern u64 get_timebased_time();
extern u64 get_system_time();
void do_cell_atomic_128_store(u32 addr, const void* to_write);
extern thread_local u64 g_tls_fault_spu;
namespace spu
@ -606,7 +608,7 @@ const auto spu_putllc_tx = build_function_asm<u32(*)(u32 raddr, u64 rtime, const
c.ret();
});
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, spu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rdata, cpu_thread* _spu)>([](asmjit::X86Assembler& c, auto& args)
{
using namespace asmjit;
@ -701,7 +703,7 @@ const auto spu_putlluc_tx = build_function_asm<u32(*)(u32 raddr, const void* rda
//c.jmp(fall);
c.bind(fall);
c.lock().bts(x86::dword_ptr(args[2], ::offset32(&spu_thread::state)), static_cast<u32>(cpu_flag::wait));
c.lock().bts(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::wait));
// Touch memory if transaction failed without RETRY flag on the first attempt
c.cmp(x86::r12, 1);
@ -1474,6 +1476,19 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
size0 = std::min<u32>(128 - (eal & 127), std::min<u32>(size, 128));
if (size0 == 128u && g_cfg.core.accurate_cache_line_stores)
{
// As atomic as PUTLLUC
do_cell_atomic_128_store(eal, src);
if (size == size0)
{
break;
}
continue;
}
// Lock each cache line exclusively
auto [res, time0] = vm::reservation_lock(eal, size0, vm::dma_lockb);
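
For SPU PUT commands, the chunking logic above takes the atomic path only when the current chunk covers a whole aligned 128-byte line and the option is enabled; everything else still goes through the locked piecewise copy. A simplified, self-contained sketch of that decision (toy guest buffer and a plain memcpy standing in for the reservation-locked path):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstring>

// Toy stand-ins so the sketch compiles on its own; RPCS3 uses vm::base(),
// vm::reservation_lock() and do_cell_atomic_128_store() here.
static std::uint8_t g_mem[1 << 16];
static void atomic_line_store(std::uint32_t ea, const void* src) { std::memcpy(g_mem + ea, src, 128); }

// Shape of the PUT chunk loop: full, aligned 128-byte lines use the atomic store,
// partial lines fall back to a plain copy (assumes eal + size <= sizeof(g_mem)).
void put_dma(std::uint32_t eal, const std::uint8_t* src, std::uint32_t size, bool accurate_stores)
{
    while (size)
    {
        const std::uint32_t size0 = std::min<std::uint32_t>(128 - (eal & 127), std::min<std::uint32_t>(size, 128));

        if (size0 == 128 && accurate_stores)
        {
            atomic_line_store(eal, src); // as atomic as PUTLLUC
        }
        else
        {
            std::memcpy(g_mem + eal, src, size0); // non-atomic fallback (sketch)
        }

        eal += size0; src += size0; size -= size0;
    }
}
```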
@ -1937,6 +1952,80 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
}
}
void do_cell_atomic_128_store(u32 addr, const void* to_write)
{
using rdata_t = decltype(spu_thread::rdata);
const auto cpu = get_current_cpu_thread();
if (g_use_rtm) [[likely]]
{
const u32 result = spu_putlluc_tx(addr, to_write, cpu);
const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr;
if (render) render->pause();
if (result == 2)
{
cpu_thread::suspend_all cpu_lock(cpu);
if (vm::reservation_acquire(addr, 128) & 64)
{
// Wait for PUTLLC to complete
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
vm::reservation_acquire(addr, 128) += 64;
}
}
else if (result == 0)
{
cpu_thread::suspend_all cpu_lock(cpu);
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
{
busy_wait(100);
}
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
mov_rdata(vm::_ref<rdata_t>(addr), *static_cast<const rdata_t*>(to_write));
vm::reservation_acquire(addr, 128) += 64;
}
if (render) render->unpause();
static_cast<void>(cpu->test_stopped());
}
else
{
auto& data = vm::_ref<rdata_t>(addr);
auto [res, time0] = vm::reservation_lock(addr, 128);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
const auto render = get_rsx_if_needs_res_pause(addr);
if (render) render->pause();
auto& super_data = *vm::get_super_ptr<rdata_t>(addr);
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, *static_cast<const rdata_t*>(to_write));
res.release(time0 + 128);
}
if (render) render->unpause();
}
}
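
All of the commit's call sites funnel through this one helper: the PPU DCBZ paths pass a static zeroed buffer, while the SPU PUTLLUC and full-line PUT paths pass 128 bytes taken from local storage. A usage sketch with illustrative names (the u8/u32 aliases stand in for the codebase's own typedefs):

```cpp
#include <cstdint>

using u32 = std::uint32_t;
using u8  = std::uint8_t;

extern void do_cell_atomic_128_store(u32 addr, const void* to_write);

void store_examples(u32 ea, const u8* spu_ls_line)
{
    // PPU DCBZ / __dcbz thunk: write a whole line of zeros atomically.
    alignas(64) static constexpr u8 zero_buf[128]{};
    do_cell_atomic_128_store(ea & ~127u, zero_buf);

    // SPU PUTLLUC or a 128-byte PUT chunk: write a line copied from local storage.
    do_cell_atomic_128_store(ea & -128, spu_ls_line);
}
```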
void spu_thread::do_putlluc(const spu_mfc_cmd& args)
{
const u32 addr = args.eal & -128;
@ -1955,77 +2044,7 @@ void spu_thread::do_putlluc(const spu_mfc_cmd& args)
// Failure, fallback to the main implementation
}
const auto& to_write = _ref<decltype(rdata)>(args.lsa & 0x3ff80);
// Store unconditionally
if (g_use_rtm) [[likely]]
{
const u32 result = spu_putlluc_tx(addr, to_write.data(), this);
const auto render = result != 1 ? get_rsx_if_needs_res_pause(addr) : nullptr;
if (render) render->pause();
if (result == 2)
{
cpu_thread::suspend_all cpu_lock(this);
if (vm::reservation_acquire(addr, 128) & 64)
{
// Wait for PUTLLC to complete
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
mov_rdata(vm::_ref<decltype(rdata)>(addr), to_write);
vm::reservation_acquire(addr, 128) += 64;
}
}
else if (result == 0)
{
cpu_thread::suspend_all cpu_lock(this);
while (vm::reservation_acquire(addr, 128).bts(std::countr_zero<u32>(vm::putlluc_lockb)))
{
busy_wait(100);
}
while (vm::reservation_acquire(addr, 128) & 63)
{
busy_wait(100);
}
mov_rdata(vm::_ref<decltype(rdata)>(addr), to_write);
vm::reservation_acquire(addr, 128) += 64;
}
if (render) render->unpause();
static_cast<void>(test_stopped());
}
else
{
auto& data = vm::_ref<decltype(rdata)>(addr);
auto [res, time0] = vm::reservation_lock(addr, 128);
*reinterpret_cast<atomic_t<u32>*>(&data) += 0;
const auto render = get_rsx_if_needs_res_pause(addr);
if (render) render->pause();
auto& super_data = *vm::get_super_ptr<decltype(rdata)>(addr);
{
// Full lock (heavyweight)
// TODO: vm::check_addr
vm::writer_lock lock(addr);
mov_rdata(super_data, to_write);
res.release(time0 + 128);
}
if (render) render->unpause();
}
do_cell_atomic_128_store(addr, _ptr<decltype(rdata)>(args.lsa & 0x3ff80));
vm::reservation_notifier(addr, 128).notify_all();
}

View File

@ -44,6 +44,7 @@ struct cfg_root : cfg::node
cfg::_enum<spu_block_size_type> spu_block_size{ this, "SPU Block Size", spu_block_size_type::safe };
cfg::_bool spu_accurate_getllar{ this, "Accurate GETLLAR", false, true };
cfg::_bool spu_accurate_dma{ this, "Accurate SPU DMA", false };
cfg::_bool accurate_cache_line_stores{ this, "Accurate Cache Line Stores", false };
cfg::_bool rsx_accurate_res_access{this, "Accurate RSX reservation access", false, true};
cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled
cfg::_bool spu_cache{ this, "SPU Cache", true };

View File

@ -22,6 +22,7 @@ enum class emu_settings_type
EnableTSX,
AccurateGETLLAR,
AccurateSpuDMA,
AccurateClineStores,
AccurateLLVMdfma,
AccurateVectorNaN,
AccurateRSXAccess,
@ -163,6 +164,7 @@ static const QMap<emu_settings_type, cfg_location> settings_location =
{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
{ emu_settings_type::AccurateGETLLAR, { "Core", "Accurate GETLLAR"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
{ emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}},
{ emu_settings_type::AccurateLLVMdfma, { "Core", "LLVM Accurate DFMA"}},
{ emu_settings_type::AccurateVectorNaN, { "Core", "PPU LLVM Accurate Vector NaN values"}},
{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},

View File

@ -1735,6 +1735,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
m_emu_settings->EnhanceCheckBox(ui->accurateSpuDMA, emu_settings_type::AccurateSpuDMA);
SubscribeTooltip(ui->accurateSpuDMA, tooltips.settings.accurate_spu_dma);
m_emu_settings->EnhanceCheckBox(ui->accurateClineStores, emu_settings_type::AccurateClineStores);
SubscribeTooltip(ui->accurateClineStores, tooltips.settings.accurate_cache_line_stores);
m_emu_settings->EnhanceCheckBox(ui->accurateRSXAccess, emu_settings_type::AccurateRSXAccess);
SubscribeTooltip(ui->accurateRSXAccess, tooltips.settings.accurate_rsx_access);

View File

@ -3506,6 +3506,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="accurateClineStores">
<property name="text">
<string>Accurate Cache Line Stores</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="hookStFunc">
<property name="text">

View File

@ -78,6 +78,7 @@ public:
const QString set_daz_and_ftz = tr("Sets special MXCSR flags to debug errors in SSE operations.\nOnly used in PPU thread when it's not precise.\nOnly useful to developers.\nNever use this.");
const QString accurate_getllar = tr("Accurately processes SPU MFC_GETLLAR operation.");
const QString accurate_spu_dma = tr("Accurately processes SPU DMA operations.");
const QString accurate_cache_line_stores = tr("Accurately processes the PPU DCBZ instruction.\nIn addition, when combined with Accurate SPU DMA, SPU PUT cache-line accesses will be processed atomically.");
const QString accurate_llvm_dfma = tr("Provides extra accuracy on FMA instructions at the cost of performance.\nWhile disabling it might give a decent performance boost if your CPU doesn't support FMA, it may also introduce subtle bugs that otherwise do not occur.\nYou can't disable it if your CPU supports FMA.");
const QString accurate_vector_nan = tr("Forces the floating point NaN (Not A Number) values outputted from PPU vector instructions to be accurate to the real hardware. (0x7FC00000)");
const QString accurate_rsx_access = tr("Forces RSX pauses on SPU MFC_GETLLAR and SPU MFC_PUTLLUC operations.");