SPU analyser: basic function detection in Giga mode

Misc: fix EH frame registration (LLVM, non-Windows).
Misc: constant-folding bitcast (cpu_translator).
Misc: add syntax for LLVM arrays (cpu_translator).
Misc: use function names for proper linkage (SPU LLVM).

Changed function search and verification in Giga mode.
Basic stack frame layout analysis.
Function detection in Giga mode.
Basic use of new information in SPU LLVM.
Fixed jump table compilation in SPU LLVM.
Disable broken optimization in Accurate xfloat mode.
Make compiled SPU modules position-independent in SPU LLVM.

Optimizations include, but are not limited to:
 * Compiling SPU functions as native functions when eligible
 * Avoiding register context write-out
 * Aligned stack assumption (CWD-like instructions)
This commit is contained in:
Nekotekina 2019-05-05 16:28:41 +03:00
parent fce9d6a7b8
commit 7492f335e9
15 changed files with 1588 additions and 492 deletions

View File

@ -474,7 +474,7 @@ struct MemoryManager : llvm::RTDyldMemoryManager
s_unfire.push_front(std::make_pair(addr, size));
#endif
return RTDyldMemoryManager::registerEHFrames(addr, load_addr, size);
return RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
}
void deregisterEHFrames() override
@ -508,6 +508,10 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
// Register an exception-handling frame section for JIT-generated code.
// addr/size describe the EH frame section; load_addr is not used by this override.
// On Windows builds the body is compiled out and this function does nothing.
void registerEHFrames(u8* addr, u64 load_addr, std::size_t size) override
{
#ifndef _WIN32
// Register the frames with the in-process unwinder (base class helper)
RTDyldMemoryManager::registerEHFramesInProcess(addr, size);
// Remember the registered range at the front of s_unfire
// NOTE(review): presumably consumed later to unregister the frames - confirm against deregisterEHFrames()
s_unfire.push_front(std::make_pair(addr, size));
#endif
}
void deregisterEHFrames() override
@ -770,25 +774,6 @@ jit_compiler::~jit_compiler()
{
}
// Test SSSE3 feature: returns false when the selected CPU name is one of the
// listed targets, true for every other CPU name.
bool jit_compiler::has_ssse3() const
{
	// CPU names for which SSSE3 is reported as unavailable
	static constexpr const char* s_without_ssse3[]
	{
		"generic",
		"k8",
		"opteron",
		"athlon64",
		"athlon-fx",
		"k8-sse3",
		"opteron-sse3",
		"athlon64-sse3",
		"amdfam10",
		"barcelona",
	};

	for (const char* name : s_without_ssse3)
	{
		if (m_cpu == name)
		{
			return false;
		}
	}

	return true;
}
void jit_compiler::add(std::unique_ptr<llvm::Module> module, const std::string& path)
{
ObjectCache cache{path};

View File

@ -142,9 +142,6 @@ public:
return *m_engine;
}
// Test SSSE3 feature
bool has_ssse3() const;
// Add module (path to obj cache dir)
void add(std::unique_ptr<llvm::Module> module, const std::string& path);

View File

@ -9,7 +9,54 @@ cpu_translator::cpu_translator(llvm::Module* module, bool is_be)
, m_module(module)
, m_is_be(is_be)
{
}
// Bind the LLVM context and the JIT execution engine to this translator and
// decide whether SSSE3 may be used, based on the target CPU name of the engine.
void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine)
{
	m_context = context;
	m_engine = &engine;

	// CPU names for which SSSE3 usage is disabled (TODO: replace with a real feature test)
	static constexpr const char* s_without_ssse3[]
	{
		"generic",
		"k8",
		"opteron",
		"athlon64",
		"athlon-fx",
		"k8-sse3",
		"opteron-sse3",
		"athlon64-sse3",
		"amdfam10",
		"barcelona",
	};

	const auto target_cpu = m_engine->getTargetMachine()->getTargetCPU();

	// Enabled unless the target CPU name matches one of the entries above
	bool ssse3_allowed = true;

	for (const char* name : s_without_ssse3)
	{
		if (target_cpu == name)
		{
			ssse3_allowed = false;
			break;
		}
	}

	m_use_ssse3 = ssse3_allowed;
}
// Bitcast `val` to `type`, folding the cast immediately when `val` is a constant.
// Throws (via fmt::throw_exception) if the total bit widths of the source and
// destination types differ.
llvm::Value* cpu_translator::bitcast(llvm::Value* val, llvm::Type* type)
{
	// Total width in bits: scalar width, multiplied by the element count for vector types
	const auto total_bits = [](llvm::Type* t) -> uint
	{
		const uint scalar_bits = t->getScalarSizeInBits();
		return t->isVectorTy() ? scalar_bits * t->getVectorNumElements() : scalar_bits;
	};

	const uint dst_bits = total_bits(type);
	const uint src_bits = total_bits(val->getType());

	if (dst_bits != src_bits)
	{
		fmt::throw_exception("cpu_translator::bitcast(): incompatible type sizes (%u vs %u)", dst_bits, src_bits);
	}

	// Constants are folded right away instead of emitting an instruction
	if (const auto constant = llvm::dyn_cast<llvm::Constant>(val))
	{
		return verify(HERE, llvm::ConstantFoldCastOperand(llvm::Instruction::BitCast, constant, type, m_module->getDataLayout()));
	}

	return m_ir->CreateBitCast(val, type);
}
template <>

View File

@ -9,6 +9,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/ConstantFolding.h"
#ifdef _MSC_VER
#pragma warning(pop)
@ -19,6 +20,8 @@
#include "../Utilities/StrFmt.h"
#include "../Utilities/BEType.h"
#include "../Utilities/BitField.h"
#include "../Utilities/Log.h"
#include "../Utilities/JIT.h"
#include <unordered_map>
#include <map>
@ -47,6 +50,7 @@ struct llvm_value_t
static constexpr bool is_sint = false;
static constexpr bool is_uint = false;
static constexpr bool is_float = false;
static constexpr uint is_array = false;
static constexpr uint is_vector = false;
static constexpr uint is_pointer = false;
@ -314,6 +318,7 @@ struct llvm_value_t<T*> : llvm_value_t<T>
static constexpr bool is_sint = false;
static constexpr bool is_uint = false;
static constexpr bool is_float = false;
static constexpr uint is_array = false;
static constexpr uint is_vector = false;
static constexpr uint is_pointer = llvm_value_t<T>::is_pointer + 1;
@ -333,6 +338,7 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
using base = llvm_value_t<T>;
using base::base;
static constexpr uint is_array = 0;
static constexpr uint is_vector = N;
static constexpr uint is_pointer = 0;
@ -342,6 +348,48 @@ struct llvm_value_t<T[N]> : llvm_value_t<T>
}
};
// Specialization for the T[0][N] syntax: describes an LLVM array of N elements
// whose element type is the LLVM type of T.
template <typename T, uint N>
struct llvm_value_t<T[0][N]> : llvm_value_t<T>
{
using type = T[0][N];
using base = llvm_value_t<T>;
// Inherit the constructors of the element descriptor
using base::base;
// An array is not itself an integer or floating-point value
static constexpr bool is_int = false;
static constexpr bool is_sint = false;
static constexpr bool is_uint = false;
static constexpr bool is_float = false;
// Number of array elements (non-zero marks this descriptor as an array)
static constexpr uint is_array = N;
static constexpr uint is_vector = false;
static constexpr uint is_pointer = false;
// Build the corresponding llvm::ArrayType: N elements of T's LLVM type
static llvm::Type* get_type(llvm::LLVMContext& context)
{
return llvm::ArrayType::get(llvm_value_t<T>::get_type(context), N);
}
};
// Specialization for the T[V][N] syntax: describes an LLVM array of N elements
// whose element type is the LLVM type described by llvm_value_t<T[V]>.
template <typename T, uint V, uint N>
struct llvm_value_t<T[V][N]> : llvm_value_t<T[V]>
{
using type = T[V][N];
using base = llvm_value_t<T[V]>;
// Inherit the constructors of the element descriptor
using base::base;
// An array is not itself an integer or floating-point value
static constexpr bool is_int = false;
static constexpr bool is_sint = false;
static constexpr bool is_uint = false;
static constexpr bool is_float = false;
// Number of array elements (non-zero marks this descriptor as an array)
static constexpr uint is_array = N;
// The array itself is not flagged as a vector, even though its element type is T[V]
static constexpr uint is_vector = false;
static constexpr uint is_pointer = false;
// Build the corresponding llvm::ArrayType: N elements of the T[V] LLVM type
static llvm::Type* get_type(llvm::LLVMContext& context)
{
return llvm::ArrayType::get(llvm_value_t<T[V]>::get_type(context), N);
}
};
template <typename T>
using llvm_expr_t = std::decay_t<T>;
@ -2368,6 +2416,9 @@ protected:
// Module to which all generated code is output to
llvm::Module* m_module;
// Execution engine from JIT instance
llvm::ExecutionEngine* m_engine{};
// Endianness, affects vector element numbering (TODO)
bool m_is_be;
@ -2377,6 +2428,8 @@ protected:
// IR builder
llvm::IRBuilder<>* m_ir;
void initialize(llvm::LLVMContext& context, llvm::ExecutionEngine& engine);
public:
// Convert a C++ type to an LLVM type (TODO: remove)
template <typename T>
@ -2421,6 +2474,26 @@ public:
return result;
}
// Call external function: provide name and function pointer.
// Declares (or reuses) a function called `lame` in the current module, with return
// type RT and parameter types taken from the LLVM types of `args`, maps that name
// to the address of `_func` in the execution engine, and emits the call.
// The number of `args` must match the number of parameters of `_func`.
// Returns the created call instruction.
template <typename RT, typename... FArgs, typename... Args>
llvm::CallInst* call(std::string_view lame, RT(*_func)(FArgs...), Args... args)
{
	static_assert(sizeof...(FArgs) == sizeof...(Args), "cpu_translator::call(): unexpected arg number");

	// Build the name reference once; it is used for both the declaration and the mapping
	const llvm::StringRef name{lame.data(), lame.size()};

	const auto type = llvm::FunctionType::get(get_type<RT>(), {args->getType()...}, false);
	const auto func = llvm::cast<llvm::Function>(m_module->getOrInsertFunction(name, type).getCallee());

	// Bind the symbol name to the host function address so the JIT can resolve it
	m_engine->addGlobalMapping(name, reinterpret_cast<std::uintptr_t>(_func));

	return m_ir->CreateCall(func, {args...});
}
// Bitcast with immediate constant folding
llvm::Value* bitcast(llvm::Value* val, llvm::Type* type);
// Bitcast to the LLVM type corresponding to the C++ type T
template <typename T>
llvm::Value* bitcast(llvm::Value* val)
{
	llvm::Type* const target_type = get_type<T>();
	return bitcast(val, target_type);
}
template <typename T>
static llvm_placeholder_t<T> match()
{

View File

@ -4677,7 +4677,7 @@ bool ppu_interpreter::MTFSB0(ppu_thread& ppu, ppu_opcode_t op)
bool ppu_interpreter::MTFSFI(ppu_thread& ppu, ppu_opcode_t op)
{
const u32 bf = op.crfd * 4;
if (bf != 4 * 4)
if (bf != 4 * 4)
{
// Do nothing on non-FPCC field (TODO)
LOG_WARNING(PPU, "MTFSFI(%d)", op.crfd);

View File

@ -1711,7 +1711,7 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
// Initialize translator
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.has_ssse3());
PPUTranslator translator(jit.get_context(), module.get(), module_part, jit.get_engine());
// Define some types
const auto _void = Type::getVoidTy(jit.get_context());

View File

@ -79,7 +79,7 @@ public:
result |= bit;
}
return result;
return result;
}
// Unpack CR bits

View File

@ -11,14 +11,13 @@ using namespace llvm;
const ppu_decoder<PPUTranslator> s_ppu_decoder;
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, bool ssse3)
PPUTranslator::PPUTranslator(LLVMContext& context, Module* module, const ppu_module& info, ExecutionEngine& engine)
: cpu_translator(module, false)
, m_info(info)
, m_pure_attr(AttributeList::get(m_context, AttributeList::FunctionIndex, {Attribute::NoUnwind, Attribute::ReadNone}))
{
// Bind context
m_context = context;
m_use_ssse3 = ssse3;
cpu_translator::initialize(context, engine);
// There is no weak linkage on JIT, so let's create variables with different names for each module part
const u32 gsuffix = m_info.name.empty() ? info.funcs[0].addr : info.funcs[0].addr - m_info.segs[0].addr;

View File

@ -315,7 +315,7 @@ public:
// Handle compilation errors
void CompilationError(const std::string& error);
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, bool ssse3);
PPUTranslator(llvm::LLVMContext& context, llvm::Module* module, const ppu_module& info, llvm::ExecutionEngine& engine);
~PPUTranslator();
// Get thread context struct type

View File

@ -260,7 +260,7 @@ bool spu_thread::write_reg(const u32 addr, const u32 value)
void spu_load_exec(const spu_exec_object& elf)
{
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x40000, vm::spu));
auto ls0 = vm::cast(vm::falloc(RAW_SPU_BASE_ADDR, 0x80000, vm::spu));
auto spu = idm::make_ptr<named_thread<spu_thread>>("TEST_SPU", ls0, nullptr, 0, "");
spu_thread::g_raw_spu_ctr++;

View File

@ -11,6 +11,7 @@ struct spu_itype
static constexpr struct branch_tag{} branch{}; // Branch Instructions
static constexpr struct floating_tag{} floating{}; // Floating-Point Instructions
static constexpr struct quadrop_tag{} _quadrop{}; // 4-op Instructions
static constexpr struct xfloat_tag{} xfloat{}; // Instructions producing xfloat values
enum type : unsigned char
{
@ -146,24 +147,26 @@ struct spu_itype
FMS, // quadrop_tag last
FA,
DFA,
FS,
DFS,
FM,
FREST,
FRSQEST,
FI,
CSFLT,
CUFLT,
FRDS, // xfloat_tag last
DFA,
DFS,
DFM,
DFMA,
DFNMS,
DFMS,
DFNMA,
FREST,
FRSQEST,
FI,
CSFLT,
CFLTS,
CUFLT,
CFLTU,
FRDS,
FESD,
CFLTS,
CFLTU,
FCEQ,
FCMEQ,
FCGT,
@ -252,6 +255,12 @@ struct spu_itype
{
return value >= MPYA && value <= FMS;
}
// Test for xfloat instruction: true when the value lies in the enumerator range FMA..FRDS (inclusive)
friend constexpr bool operator &(type value, xfloat_tag)
{
	return !(value < FMA || value > FRDS);
}
};
struct spu_iflag

File diff suppressed because it is too large Load Diff

View File

@ -44,8 +44,14 @@ class spu_runtime
atomic_t<u64> m_reset_count{0};
struct func_compare
{
// Comparison function for SPU programs
bool operator()(const std::vector<u32>& lhs, const std::vector<u32>& rhs) const;
};
// All functions
std::map<std::vector<u32>, spu_function_t> m_map;
std::map<std::vector<u32>, spu_function_t, func_compare> m_map;
// Debug module output location
std::string m_cache_path;
@ -57,8 +63,8 @@ class spu_runtime
u16 from;
u16 level;
u8* rel32;
std::map<std::vector<u32>, spu_function_t>::iterator beg;
std::map<std::vector<u32>, spu_function_t>::iterator end;
decltype(m_map)::iterator beg;
decltype(m_map)::iterator end;
};
// Scratch vector
@ -199,6 +205,17 @@ public:
s_reg_max
};
// Classify terminator instructions
// Stored in a single byte (underlying type is unsigned char).
// NOTE(review): enumerator meanings presumably follow their names (branch, return,
// call, ...) - confirm against the analyser code that assigns them.
enum class term_type : unsigned char
{
br,
ret,
call,
fallthrough,
indirect_call,
interrupt_call,
};
protected:
std::shared_ptr<spu_runtime> m_spurt;
@ -239,12 +256,39 @@ protected:
// Internal use flag
bool analysed = false;
// Terminator instruction type
term_type terminator;
// Bit mask of the registers modified in the block
std::bitset<s_reg_max> reg_mod{};
// Set if last modifying instruction produces xfloat
std::bitset<s_reg_max> reg_mod_xf{};
// Set if the initial register value in this block may be xfloat
std::bitset<s_reg_max> reg_maybe_xf{};
// Bit mask of the registers used (before modified)
std::bitset<s_reg_max> reg_use{};
// Bit mask of the trivial (u32 x 4) constant value resulting in this block
std::bitset<s_reg_max> reg_const{};
// Bit mask of register saved onto the stack before use
std::bitset<s_reg_max> reg_save_dom{};
// Address of the function
u32 func = 0x40000;
// Value subtracted from $SP in this block, negative if something funny is done on $SP
u32 stack_sub = 0;
// Constant values associated with reg_const
std::array<u32, s_reg_max> reg_val32;
// Registers loaded from the stack in this block (stack offset)
std::array<u32, s_reg_max> reg_load_mod{};
// Single source of the reg value (dominating block address within the same chunk) or a negative number
std::array<u32, s_reg_max> reg_origin, reg_origin_abs;
@ -258,13 +302,27 @@ protected:
// Sorted basic block info
std::map<u32, block_info> m_bbs;
// Advanced block (chunk) information
struct chunk_info
// Sorted advanced block (chunk) list
std::basic_string<u32> m_chunks;
// Function information
struct func_info
{
// Size to the end of last basic block
u16 size = 0;
// Determines whether a function is eligible for optimizations
bool good = false;
// Call targets
std::basic_string<u32> calls;
// Register save info (stack offset)
std::array<u32, s_reg_max> reg_save_off{};
};
// Sorted chunk info
std::map<u32, chunk_info> m_chunks;
// Sorted function info
std::map<u32, func_info> m_funcs;
std::shared_ptr<spu_cache> m_cache;
@ -272,6 +330,9 @@ private:
// For private use
std::bitset<0x10000> m_bits;
// For private use
std::vector<u32> workload;
// Result of analyse(), to avoid copying and allocation
std::vector<u32> result;

View File

@ -579,6 +579,10 @@ public:
u64 block_recover = 0;
u64 block_failure = 0;
u64 saved_native_sp = 0; // Host thread's stack pointer for emulated longjmp
u8* memory_base_addr = vm::g_base_addr;
std::array<v128, 0x4000> stack_mirror; // Return address information
void push_snr(u32 number, u32 value);

View File

@ -232,7 +232,7 @@ error_code sys_spu_thread_initialize(vm::ptr<u32> thread, u32 group_id, u32 spu_
sys_spu.todo("Unimplemented SPU Thread options (0x%x)", option);
}
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x40000, vm::main))};
const vm::addr_t ls_addr{verify("SPU LS" HERE, vm::alloc(0x80000, vm::main))};
const u32 tid = idm::import<named_thread<spu_thread>>([&]()
{