SPU LLVM: LS Memory Mirrors (Optimize loads/stores)

Eladash 2020-07-15 21:57:39 +03:00 committed by Ivan
parent c1a80b8146
commit af1ceb1151
5 changed files with 104 additions and 31 deletions
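The core of the change: each spu_thread now reserves a host region of 3 * SPU_LS_SIZE bytes and maps the same Local Storage shared-memory block into all three slots, keeping the new ls pointer on the middle mapping. Accesses slightly below LS address 0 or slightly past SPU_LS_SIZE land on the low or high mirror and alias the correct LS bytes, which lets the LLVM recompiler relax address masking on loads and stores (see the STQX/LQX/STQD/LQD/STQR/LQR hunks below). A rough conceptual sketch of the layout, reusing the utils::memory_reserve / shm->map_critical helpers that appear in the constructor hunk (not a verbatim excerpt of the commit):

// base                 base + 0x40000        base + 0x80000
// [ low mirror ]       [ main LS view ]      [ high mirror ]
//                       ^ ls points at the middle mapping
//
// ls[-0x10]    aliases LS address 0x3fff0
// ls[0x40000]  aliases LS address 0x0
u8* base = static_cast<u8*>(utils::memory_reserve(SPU_LS_SIZE * 3));
for (u32 i = 0; i < 3; i++)
{
	// Map the same 256 KiB shared-memory block three times, back to back
	shm->map_critical(base + i * SPU_LS_SIZE);
}
u8* ls = base + SPU_LS_SIZE; // all LS accesses go through the middle mirror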

View File

@ -173,7 +173,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
{
case MFC_LSA_offs:
{
if (value >= 0x40000)
if (value >= SPU_LS_SIZE)
{
break;
}
@ -321,7 +321,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
void spu_load_exec(const spu_exec_object& elf)
{
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, SPU_LS_SIZE, vm::spu));
auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "", 0);
spu_thread::g_raw_spu_ctr++;
@ -331,7 +331,7 @@ void spu_load_exec(const spu_exec_object& elf)
{
if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz)
{
std::memcpy(vm::base(spu->offset + prog.p_vaddr), prog.bin.data(), prog.p_filesz);
std::memcpy(spu->_ptr<void>(prog.p_vaddr), prog.bin.data(), prog.p_filesz);
}
}

View File

@ -272,8 +272,7 @@ DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, sp
// Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]);
c.mov(x86::ebp, x86::dword_ptr(args[0], ::offset32(&spu_thread::offset)));
c.add(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::memory_base_addr)));
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx);
c.jmp(args[1]);
@ -1138,7 +1137,7 @@ void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
}
// Find function
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(vm::base(spu.offset)), spu.pc);
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(spu._ptr<void>(0)), spu.pc);
if (!func)
{
@ -7902,13 +7901,51 @@ public:
void STQX(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0));
const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
v128 data = get_const_vector(cv, m_pos, 10000);
data._u32[3] %= SPU_LS_SIZE;
if (data._u32[3] % 0x10 == 0)
{
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
return;
}
}
}
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
void LQX(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + extract(get_vr(op.rb), 3)) & 0x3fff0));
const auto a = get_vr(op.ra);
const auto b = get_vr(op.rb);
for (auto pair : std::initializer_list<std::pair<value_t<u32[4]>, value_t<u32[4]>>>{{a, b}, {b, a}})
{
if (auto cv = llvm::dyn_cast<llvm::Constant>(pair.first.value))
{
v128 data = get_const_vector(cv, m_pos, 10000);
data._u32[3] %= SPU_LS_SIZE;
if (data._u32[3] % 0x10 == 0)
{
value_t<u64> addr = eval(splat<u64>(data._u32[3]) + zext<u64>(extract(pair.second, 3) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr));
return;
}
}
}
value_t<u64> addr = eval(zext<u64>((extract(a, 3) + extract(b, 3)) & 0x3fff0));
set_vr(op.rt, make_load_ls(addr));
}
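The constant-operand path above relies on the mirrors for correctness: the constant's preferred word is wrapped into LS and, if it is 16-byte aligned after wrapping, added outside the 0x3fff0 mask, so the emitted address can exceed SPU_LS_SIZE by up to one LS size and simply falls into the high mirror. A small self-contained check of that address arithmetic with hypothetical operand values (not part of the commit):

#include <cassert>
#include <cstdint>

int main()
{
	constexpr uint32_t SPU_LS_SIZE = 0x40000;

	const uint32_t cnst = 0x50010; // hypothetical compile-time constant operand
	const uint32_t reg  = 0x3ff28; // hypothetical run-time operand

	// What the SPU actually addresses: wrap into LS, align down to 16 bytes.
	const uint32_t hw_addr = (cnst + reg) & 0x3fff0;

	// What the optimized STQX/LQX path emits: folded constant + masked register.
	const uint32_t folded  = cnst % SPU_LS_SIZE;       // 0x10010, 16-byte aligned
	const uint32_t emitted = folded + (reg & 0x3fff0); // may exceed SPU_LS_SIZE

	// The excess lands in the high mirror, which aliases the same LS bytes.
	assert(emitted % SPU_LS_SIZE == hw_addr);
	return 0;
}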
@ -7928,7 +7965,7 @@ public:
{
value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0);
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
@ -7936,7 +7973,7 @@ public:
{
value_t<u64> addr;
addr.value = m_ir->CreateZExt(m_interp_magn ? m_interp_pc : get_pc(m_pos), get_type<u64>());
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & 0x3fff0);
addr = eval(((get_imm<u64>(op.i16, false) << 2) + addr) & (m_interp_magn ? 0x3fff0 : ~0xf));
set_vr(op.rt, make_load_ls(addr));
}
@ -7953,13 +7990,13 @@ public:
}
}
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0));
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
make_store_ls(addr, get_vr<u8[16]>(op.rt));
}
void LQD(spu_opcode_t op)
{
value_t<u64> addr = eval(zext<u64>((extract(get_vr(op.ra), 3) + (get_imm<u32>(op.si10) << 4)) & 0x3fff0));
value_t<u64> addr = eval(zext<u64>(extract(get_vr(op.ra), 3) & 0x3fff0) + (get_imm<u64>(op.si10) << 4));
set_vr(op.rt, make_load_ls(addr));
}
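Similarly, the D-form (STQD/LQD) hunks stop wrapping the full sum into LS: only the ra word keeps the 0x3fff0 mask and the signed, scaled si10 displacement is added afterwards, so the final address can dip below the middle mirror or run past it, which the low and high mirrors absorb. The R-form (STQR/LQR) hunks keep the 0x3fff0 wrap only on the interpreter path, where the PC is not a compile-time constant, and otherwise just align the address. A small check of the D-form case with hypothetical operands (not part of the commit):

#include <cassert>
#include <cstdint>

int main()
{
	constexpr int64_t SPU_LS_SIZE = 0x40000;

	const uint32_t ra   = 0x8;   // hypothetical preferred word of ra
	const int64_t  disp = -0x10; // si10 = -1, scaled by 16

	// Old emitted address: wrap the whole sum into LS.
	const uint32_t old_addr = static_cast<uint32_t>((ra + disp) & 0x3fff0); // 0x3fff0

	// New emitted offset: only ra is masked, the displacement stays signed,
	// so the access lands 16 bytes below the middle mirror (in the low mirror).
	const int64_t new_off = int64_t{ra & 0x3fff0} + disp; // -0x10

	// The low mirror aliases the same LS bytes the old mask produced.
	assert((new_off + SPU_LS_SIZE) % SPU_LS_SIZE == old_addr);
	return 0;
}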

View File

@ -2,6 +2,7 @@
#include "Utilities/JIT.h"
#include "Utilities/asm.h"
#include "Utilities/sysinfo.h"
#include "Emu/Memory/vm.h"
#include "Emu/Memory/vm_ptr.h"
#include "Emu/Memory/vm_reservation.h"
@ -1111,7 +1112,7 @@ void spu_thread::cpu_task()
continue;
}
spu_runtime::g_gateway(*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_gateway(*this, _ptr<u8>(0), nullptr);
}
// Print some stats
@ -1129,7 +1130,7 @@ void spu_thread::cpu_task()
break;
}
spu_runtime::g_interpreter(*this, vm::_ptr<u8>(offset), nullptr);
spu_runtime::g_interpreter(*this, _ptr<u8>(0), nullptr);
}
}
@ -1148,8 +1149,21 @@ void spu_thread::cpu_unmem()
spu_thread::~spu_thread()
{
// Deallocate Local Storage
vm::dealloc_verbose_nothrow(offset);
{
const auto [_, shm] = vm::get(vm::any, offset)->get(offset);
for (s32 i = -1; i < 2; i++)
{
// Unmap LS mirrors
shm->unmap_critical(ls + (i * SPU_LS_SIZE));
}
// Deallocate Local Storage
vm::dealloc_verbose_nothrow(offset);
}
// Release LS mirrors area
utils::memory_release(ls - SPU_LS_SIZE, SPU_LS_SIZE * 3);
// Deallocate RawSPU ID
if (!group && offset >= RAW_SPU_BASE_ADDR)
@ -1159,11 +1173,26 @@ spu_thread::~spu_thread()
}
}
spu_thread::spu_thread(vm::addr_t ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated)
spu_thread::spu_thread(vm::addr_t _ls, lv2_spu_group* group, u32 index, std::string_view name, u32 lv2_id, bool is_isolated)
: cpu_thread(idm::last_id())
, is_isolated(is_isolated)
, index(index)
, offset(ls)
, offset(_ls)
, ls([&]()
{
const auto [_, shm] = vm::get(vm::any, _ls)->get(_ls);
const auto addr = static_cast<u8*>(utils::memory_reserve(SPU_LS_SIZE * 3));
for (u32 i = 0; i < 3; i++)
{
// Map LS mirrors
const auto ptr = addr + (i * SPU_LS_SIZE);
verify(HERE), shm->map_critical(ptr) == ptr;
}
// Use the middle mirror
return addr + SPU_LS_SIZE;
}())
, group(group)
, lv2_id(lv2_id)
, spu_tname(stx::shared_cptr<std::string>::make(name))
@ -1233,7 +1262,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
}
u32 value;
if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < 0x40000) // LS access
if ((eal - RAW_SPU_BASE_ADDR) % RAW_SPU_OFFSET + args.size - 1 < SPU_LS_SIZE) // LS access
{
}
else if (args.size == 4 && is_get && thread->read_reg(eal, value))
@ -1258,7 +1287,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
{
auto& spu = static_cast<spu_thread&>(*group->threads[group->threads_map[index]]);
if (offset + args.size - 1 < 0x40000) // LS access
if (offset + args.size - 1 < SPU_LS_SIZE) // LS access
{
eal = spu.offset + offset; // redirect access
}
@ -1282,7 +1311,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
auto [dst, src] = [&]() -> std::pair<u8*, const u8*>
{
u8* dst = vm::_ptr<u8>(eal);
u8* src = vm::_ptr<u8>(offset + lsa);
u8* src = _ptr<u8>(lsa);
if (is_get)
{
@ -1638,6 +1667,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
args.lsa &= 0x3fff0;
args.eal &= 0x3fff8;
u32 index = fetch_size;
@ -1650,7 +1680,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
// Reset to elements array head
index = 0;
const auto src = _ptr<const void>(args.eal & 0x3fff8);
const auto src = _ptr<const void>(args.eal);
const v128 data0 = v128::loadu(src, 0);
const v128 data1 = v128::loadu(src, 1);
const v128 data2 = v128::loadu(src, 2);
@ -2947,7 +2977,7 @@ bool spu_thread::stop_and_signal(u32 code)
spu_log.warning("STOP 0x0");
// HACK: find an ILA instruction
for (u32 addr = pc; addr < 0x40000; addr += 4)
for (u32 addr = pc; addr < SPU_LS_SIZE; addr += 4)
{
const u32 instr = _ref<u32>(addr);

View File

@ -118,6 +118,11 @@ enum : u32
SPU_STATUS_IS_ISOLATED = 0x80,
};
enum : s32
{
SPU_LS_SIZE = 0x40000,
};
enum : u32
{
SYS_SPU_THREAD_BASE_LOW = 0xf0000000,
@ -636,6 +641,7 @@ public:
const u32 index; // SPU index
const u32 offset; // SPU LS offset
const std::add_pointer_t<u8> ls; // SPU LS pointer
private:
lv2_spu_group* const group; // SPU Thread Group (only safe to access in the spu thread itself)
public:
@ -682,7 +688,7 @@ public:
template<typename T>
inline to_be_t<T>* _ptr(u32 lsa)
{
return static_cast<to_be_t<T>*>(vm::base(offset + lsa));
return reinterpret_cast<to_be_t<T>*>(ls + lsa);
}
// Convert specified SPU LS address to a reference of specified (possibly converted to BE) type
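With the new ls member, _ptr (and the _ref helper built on it) resolves an LS address to a plain host pointer into the middle mirror instead of going through vm::base(offset + lsa). A minimal usage sketch, not from the commit; spu, buffer and lsa are hypothetical names:

// Read a big-endian u32 at LS address 0x100 through the mirror mapping.
const u32 word = spu._ref<u32>(0x100);

// Copy a quadword out of LS; no vm address translation is involved.
std::memcpy(buffer, spu._ptr<void>(lsa & 0x3fff0), 16);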

View File

@ -397,7 +397,7 @@ error_code sys_spu_thread_initialize(ppu_thread& ppu, vm::ptr<u32> thread, u32 g
sys_spu.warning("Unimplemented SPU Thread options (0x%x)", option);
}
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(SPU_LS_SIZE, vm::main))};
const u32 inited = group->init;
@ -579,7 +579,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
if (type & SYS_SPU_THREAD_GROUP_TYPE_COOPERATE_WITH_SYSTEM)
{
// Constant size, unknown what it means but it's definitely not for each spu thread alone
mem_size = 0x40000;
mem_size = SPU_LS_SIZE;
use_scheduler = false;
}
else if (type & SYS_SPU_THREAD_GROUP_TYPE_NON_CONTEXT)
@ -591,7 +591,7 @@ error_code sys_spu_thread_group_create(ppu_thread& ppu, vm::ptr<u32> id, u32 num
else
{
// 256kb for each spu thread, probably for saving and restoring SPU LS (used by scheduler?)
mem_size = 0x40000 * num;
mem_size = SPU_LS_SIZE * num;
}
if (num < min_threads || num > max_threads ||
@ -1225,7 +1225,7 @@ error_code sys_spu_thread_write_ls(ppu_thread& ppu, u32 id, u32 lsa, u64 value,
sys_spu.trace("sys_spu_thread_write_ls(id=0x%x, lsa=0x%05x, value=0x%llx, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{
return CELL_EINVAL;
}
@ -1268,7 +1268,7 @@ error_code sys_spu_thread_read_ls(ppu_thread& ppu, u32 id, u32 lsa, vm::ptr<u64>
sys_spu.trace("sys_spu_thread_read_ls(id=0x%x, lsa=0x%05x, value=*0x%x, type=%d)", id, lsa, value, type);
if (lsa >= 0x40000 || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
if (lsa >= SPU_LS_SIZE || type > 8 || !type || (type | lsa) & (type - 1)) // check range and alignment
{
return CELL_EINVAL;
}
@ -1831,7 +1831,7 @@ error_code sys_raw_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<void> at
index = 0;
}
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))};
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const u32 tid = idm::make<named_thread<spu_thread>>(fmt::format("RawSPU[0x%x] ", index), ls_addr, nullptr, index, "", index);
@ -1879,7 +1879,7 @@ error_code sys_isolated_spu_create(ppu_thread& ppu, vm::ptr<u32> id, vm::ptr<voi
index = 0;
}
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, 0x40000, vm::spu))};
const vm::addr_t ls_addr{verify(HERE, vm::falloc(RAW_SPU_BASE_ADDR + RAW_SPU_OFFSET * index, SPU_LS_SIZE, vm::spu))};
const auto thread = idm::make_ptr<named_thread<spu_thread>>(fmt::format("IsoSPU[0x%x] ", index), ls_addr, nullptr, index, "", index, true);