SPU Re: more precise jt generation

Improve analyser, set v1
Fix branch indirect conditional
This commit is contained in:
Nekotekina 2018-05-10 19:38:07 +03:00
parent 737db90058
commit be5c18cc85
4 changed files with 190 additions and 68 deletions

View File

@ -1156,27 +1156,36 @@ void spu_recompiler::branch_fixed(u32 target)
c->jmp(x86::rax);
}
void spu_recompiler::branch_indirect(spu_opcode_t op)
void spu_recompiler::branch_indirect(spu_opcode_t op, bool jt)
{
using namespace asmjit;
if (!instr_table.isValid())
if (g_cfg.core.spu_block_size == spu_block_size_type::safe && !jt)
{
// Request instruction table
instr_table = c->newLabel();
// Simply external call (return or indirect call)
c->mov(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
c->xor_(qw0->r32(), qw0->r32());
}
else
{
if (!instr_table.isValid())
{
// Request instruction table
instr_table = c->newLabel();
}
const u32 start = instr_labels.begin()->first;
const u32 end = instr_labels.rbegin()->first + 4;
const u32 start = instr_labels.begin()->first;
const u32 end = instr_labels.rbegin()->first + 4;
// Load indirect jump address, choose between local and external
c->lea(x86::r10, x86::qword_ptr(instr_table));
c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
c->xor_(qw0->r32(), qw0->r32());
c->cmp(qw1->r32(), end - start);
c->cmovae(qw1->r32(), qw0->r32());
c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
// Load indirect jump address, choose between local and external
c->lea(x86::r10, x86::qword_ptr(instr_table));
c->lea(*qw1, x86::qword_ptr(*addr, 0 - start));
c->xor_(qw0->r32(), qw0->r32());
c->cmp(qw1->r32(), end - start);
c->cmovae(qw1->r32(), qw0->r32());
c->cmovb(x86::r10, x86::qword_ptr(x86::r10, *qw1, 1, 0));
c->cmovae(x86::r10, x86::qword_ptr(*cpu, addr->r64(), 1, offset32(&SPUThread::jit_dispatcher)));
}
if (op.d)
{
@ -2741,7 +2750,7 @@ void spu_recompiler::BI(spu_opcode_t op)
{
c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
c->and_(*addr, 0x3fffc);
branch_indirect(op);
branch_indirect(op, verify(HERE, m_targets[m_pos].size()) > 2);
m_pos = -1;
}

View File

@ -107,7 +107,7 @@ private:
asmjit::X86Mem XmmConst(__m128i data);
void branch_fixed(u32 target);
void branch_indirect(spu_opcode_t op);
void branch_indirect(spu_opcode_t op, bool jt = false);
void fall(spu_opcode_t op);
void save_rcx();
void load_rcx();

View File

@ -89,7 +89,7 @@ void spu_cache::initialize()
}
// SPU cache file (version + block size type)
const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v0.dat";
const std::string loc = _main->cache + u8"spu-§" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1.dat";
auto cache = std::make_shared<spu_cache>(loc);
@ -272,14 +272,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
result.push_back(lsa);
// Initialize block entries
std::bitset<0x10000>& blocks = m_block_info;
blocks.reset();
blocks.set(lsa / 4);
m_block_info.reset();
m_block_info.set(lsa / 4);
// Simple block entry workload list
std::vector<u32> wl;
wl.push_back(lsa);
m_regmod.fill(0xff);
m_targets.clear();
// Value flags (TODO)
enum class vf : u32
{
@ -310,9 +312,9 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (target > lsa)
{
// Check for redundancy
if (!blocks[target / 4])
if (!m_block_info[target / 4])
{
blocks[target / 4] = true;
m_block_info[target / 4] = true;
wl.push_back(target);
return;
}
@ -325,6 +327,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
wl[wi] += 4;
m_targets.erase(pos);
// Analyse instruction
switch (const auto type = s_spu_itype.decode(data))
{
@ -336,7 +340,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
case spu_itype::DFTSV:
{
// Stop before invalid instructions (TODO)
blocks[pos / 4] = true;
m_targets[pos].push_back(-1);
m_block_info[pos / 4] = true;
next_block();
continue;
}
@ -349,7 +354,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (data == 0 || data == 3)
{
// Stop before null data
blocks[pos / 4] = true;
m_targets[pos].push_back(-1);
m_block_info[pos / 4] = true;
next_block();
continue;
}
@ -357,6 +363,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
// Stop on special instructions (TODO)
m_targets[pos].push_back(-1);
next_block();
break;
}
@ -366,6 +373,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
case spu_itype::IRET:
{
m_targets[pos].push_back(-1);
next_block();
break;
}
@ -382,6 +390,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (type == spu_itype::BISL)
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
}
@ -389,23 +398,24 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
if (test(af, vf::is_const))
{
const u32 target = spu_branch_target(av);
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
if (target == pos + 4)
{
// Nop (unless BISL)
break;
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to next!", lsa, pos);
}
m_targets[pos].push_back(target);
if (type != spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
LOG_WARNING(SPU, "[0x%x] At 0x%x: indirect branch to 0x%x", lsa, pos, target);
add_block(target);
}
if (type == spu_itype::BISL && target < lsa)
if (type == spu_itype::BISL && target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
next_block();
break;
add_block(pos + 4);
}
}
else if (type == spu_itype::BI && !op.d && !op.e)
@ -488,6 +498,8 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(jt_abs[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_abs[i]);
}
m_targets.emplace(pos, std::move(jt_abs));
}
if (jt_rel.size() >= jt_abs.size())
@ -504,19 +516,33 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(jt_rel[i]);
result[(start - lsa) / 4 + 1 + i] = se_storage<u32>::swap(jt_rel[i] - start);
}
m_targets.emplace(pos, std::move(jt_rel));
}
}
}
if (type == spu_itype::BI || type == spu_itype::BISL || g_cfg.core.spu_block_size == spu_block_size_type::safe)
if (type == spu_itype::BI || type == spu_itype::BISL)
{
if (type == spu_itype::BI || g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
next_block();
break;
if (m_targets[pos].empty())
{
m_targets[pos].push_back(-1);
}
}
else
{
add_block(pos + 4);
}
}
else
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
@ -525,6 +551,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
{
const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
@ -534,11 +561,11 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
break;
}
if (target < lsa || g_cfg.core.spu_block_size != spu_block_size_type::giga)
m_targets[pos].push_back(target);
if (target >= lsa && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
// Stop on direct calls
next_block();
break;
add_block(pos + 4);
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
@ -546,6 +573,7 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
add_block(target);
}
next_block();
break;
}
@ -564,15 +592,16 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
break;
}
m_targets[pos].push_back(target);
add_block(target);
if (type == spu_itype::BR || type == spu_itype::BRA)
if (type != spu_itype::BR && type != spu_itype::BRA)
{
// Stop on direct branches
next_block();
break;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
@ -601,61 +630,131 @@ std::vector<u32> spu_recompiler_base::block(const be_t<u32>* ls, u32 lsa)
// Constant-tracking cases. Each case below:
//  - stores the destination register number in m_regmod for this instruction's slot (pos / 4),
//  - updates vflags[rt] to say whether rt is now considered a known constant (vf::is_const),
//  - updates values[rt] with the tracked value.
// NOTE(review): vflags/values are declared outside this excerpt; presumably per-register arrays - confirm.
case spu_itype::IL:
{
// rt becomes a known constant taken from the op.si16 field
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.si16;
break;
}
case spu_itype::ILA:
{
// rt becomes a known constant taken from the op.i18 field
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i18;
break;
}
case spu_itype::ILH:
{
// rt becomes a known constant: op.i16 replicated into both 16-bit halves
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16 | op.i16;
break;
}
case spu_itype::ILHU:
{
// rt becomes a known constant: op.i16 placed in the upper 16 bits
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16;
break;
}
case spu_itype::IOHL:
{
// ORs op.i16 into the value already tracked for rt.
// vflags[rt] is intentionally left untouched here, so rt keeps whatever const state it had before.
m_regmod[pos / 4] = op.rt;
values[op.rt] = values[op.rt] | op.i16;
break;
}
case spu_itype::ORI:
{
// rt is const only if ra is const; value is ra | si10
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] | op.si10;
break;
}
case spu_itype::OR:
{
// rt is const only if both ra and rb are const; value is ra | rb
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] | values[op.rb];
break;
}
case spu_itype::ANDI:
{
// rt is const only if ra is const; value is ra & si10
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] & op.si10;
break;
}
case spu_itype::AND:
{
// rt is const only if both ra and rb are const; value is ra & rb
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] & values[op.rb];
break;
}
case spu_itype::AI:
{
// rt is const only if ra is const; value is ra + si10
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] + op.si10;
break;
}
case spu_itype::A:
{
// rt is const only if both ra and rb are const; value is ra + rb
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] + values[op.rb];
break;
}
case spu_itype::SFI:
{
// rt is const only if ra is const; note the operand order: si10 - ra
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = op.si10 - values[op.ra];
break;
}
case spu_itype::SF:
{
// rt is const only if both ra and rb are const; note the operand order: rb - ra
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.rb] - values[op.ra];
break;
}
case spu_itype::ROTMI:
{
m_regmod[pos / 4] = op.rt;
// The shift amount is derived from the negated i7 field.
// If bit 0x20 of it is set, the result is recorded as constant zero regardless of ra.
if (-op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
// Otherwise rt is const only if ra is const; value is ra shifted right by the low 5 bits
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] >> (-op.i7 & 0x1f);
break;
}
case spu_itype::SHLI:
{
m_regmod[pos / 4] = op.rt;
// If bit 0x20 of i7 is set, the result is recorded as constant zero regardless of ra.
if (op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
// Otherwise rt is const only if ra is const; value is ra shifted left by the low 5 bits
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
break;
}
default:
{
// Unconst
vflags[type & spu_itype::_quadrop ? +op.rt4 : +op.rt] = {};
const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
m_regmod[pos / 4] = op_rt;
vflags[op_rt] = {};
break;
}
}
@ -783,7 +882,6 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
llvm::Value* m_lsptr;
llvm::BasicBlock* m_stop;
llvm::GlobalVariable* m_jt;
std::array<std::pair<llvm::Value*, llvm::Value*>, 128> m_gpr;
std::array<llvm::Instruction*, 128> m_flush_gpr;
@ -1047,27 +1145,15 @@ public:
m_stop = BasicBlock::Create(m_context, "", m_function);
const auto jtt = ArrayType::get(GetType<u8*>(), m_size / 4);
std::vector<llvm::Constant*> jt;
jt.reserve(m_size / 4);
// Create instruction blocks
for (u32 i = 1, pos = start; i < func.size(); i++, pos += 4)
{
if (func[i] && m_block_info[pos / 4])
{
const auto b = BasicBlock::Create(m_context, "", m_function);
jt.push_back(llvm::BlockAddress::get(b));
m_instr_map.emplace(pos, b);
}
else
{
jt.push_back(llvm::BlockAddress::get(m_stop));
m_instr_map.emplace(pos, BasicBlock::Create(m_context, "", m_function));
}
}
m_jt = new GlobalVariable(*module, jtt, true, GlobalValue::PrivateLinkage, llvm::ConstantArray::get(jtt, jt), "jt");
update_pc();
const auto label_test = BasicBlock::Create(m_context, "", m_function);
@ -2764,24 +2850,43 @@ public:
addr.value = call(&exec_check_interrupts, m_thread, addr.value);
}
if (llvm::isa<llvm::ConstantInt>(addr.value))
if (const auto _int = llvm::dyn_cast<llvm::ConstantInt>(addr.value))
{
return branch_fixed(llvm::cast<llvm::ConstantInt>(addr.value)->getZExtValue());
LOG_WARNING(SPU, "[0x%x] Fixed branch to 0x%x", m_pos, _int->getZExtValue());
return branch_fixed(_int->getZExtValue());
}
m_ir->CreateStore(addr.value, spu_ptr<u32>(&SPUThread::pc));
const u32 start = m_instr_map.begin()->first;
const auto local = llvm::BasicBlock::Create(m_context, "", m_function);
const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
const auto off = m_ir->CreateSub(addr.value, m_ir->getInt32(start));
m_ir->CreateCondBr(m_ir->CreateICmpULT(off, m_ir->getInt32(m_size)), local, exter);
m_ir->SetInsertPoint(local);
const auto table = m_ir->CreateIndirectBr(m_ir->CreateLoad(m_ir->CreateGEP(m_jt, {(llvm::Value*)m_ir->getInt32(0), m_ir->CreateLShr(off, 2)})), m_instr_map.size() + 1);
for (const auto& pair : m_instr_map)
table->addDestination(pair.second);
table->addDestination(m_stop);
m_ir->SetInsertPoint(exter);
// Look up the list of targets recorded for the instruction at m_pos.
const auto tfound = m_targets.find(m_pos);
// Emit a local switch only when at least 3 targets were recorded for this instruction;
// otherwise nothing is emitted here and control falls through to the code that follows.
if (tfound != m_targets.end() && tfound->second.size() >= 3)
{
// Lowest address present in the instruction map
const u32 start = m_instr_map.begin()->first;
// Copy the targets into a set for the membership test in the loop below
const std::set<u32> targets(tfound->second.begin(), tfound->second.end());
// Default destination of the switch: taken when no case matches
const auto exter = llvm::BasicBlock::Create(m_context, "", m_function);
// Switch on (address >> 2); the case values below are pos / 4 accordingly.
// The trailing `true` is passed as the last CreateLShr argument (the "exact" flag in the LLVM IRBuilder API - confirm for the LLVM version in use).
const auto sw = m_ir->CreateSwitch(m_ir->CreateLShr(addr.value, 2, "", true), exter, m_size / 4);
// Add one case for every 4-byte position in [start, start + m_size)
for (u32 pos = start; pos < start + m_size; pos += 4)
{
const auto found = m_instr_map.find(pos);
if (found != m_instr_map.end() && targets.count(pos))
{
// Recorded target that has a basic block: jump straight to it
sw->addCase(m_ir->getInt32(pos / 4), found->second);
}
else
{
// Position inside the range that is not a recorded target (or has no block): go to m_stop
sw->addCase(m_ir->getInt32(pos / 4), m_stop);
}
}
// Subsequent IR is emitted into the default (no case matched) block
m_ir->SetInsertPoint(exter);
}
const auto disp = m_ir->CreateAdd(m_thread, m_ir->getInt64(::offset32(&SPUThread::jit_dispatcher)));
const auto type = llvm::FunctionType::get(get_type<void>(), {get_type<u64>(), get_type<u64>(), get_type<u32>()}, false)->getPointerTo()->getPointerTo();
tail(m_ir->CreateLoad(m_ir->CreateIntToPtr(m_ir->CreateAdd(disp, zext<u64>(addr << 1).value), type)));

View File

@ -5,6 +5,7 @@
#include <vector>
#include <bitset>
#include <memory>
#include <string>
// Helper class
class spu_cache
@ -35,8 +36,15 @@ protected:
// NOTE(review): used as the key into m_targets for the instruction being processed; presumably the current instruction address - confirm
u32 m_pos;
// NOTE(review): users divide this by 4 to get a per-instruction count and add it to the start address; presumably the size in bytes of the code range - confirm
u32 m_size;
// Bit indicating start of the block
std::bitset<0x10000> m_block_info;
// GPR modified by the instruction (-1 = not set)
std::array<u8, 0x10000> m_regmod;
// List of possible targets for the instruction ({} = next instruction, {-1} = no targets)
// Keyed by instruction address; std::basic_string<u32> serves as the sequence of target addresses.
std::unordered_map<u32, std::basic_string<u32>, value_hash<u32, 2>> m_targets;
// Shared handle to an spu_cache object (NOTE(review): its role is not visible in this excerpt)
std::shared_ptr<spu_cache> m_cache;
public: