Rewrites ARMv7 fastmem entirely.

This is a fairly lengthy change that can't easily be split into multiple commits, since fastmem is a bit of an entangled mess.
It makes fastmem on ARMv7 much easier for me to maintain because I no longer have to do any terrible instruction counting and NOP padding. Really
makes my brain stop hurting when working with it.

This enables fastmem for a whole bunch of new instructions, which basically means that all instructions now have fastmem working for them. This also
rewrites the floating point load/stores again, because the previous implementation performed pretty poorly, even though it was the cleanest
implementation from my point of view.

This initially started with me rewriting the fastmem routines to work just like the previous/current implementation of the floating point
load/stores. That was when I noticed that performance tanked, so I decided to rewrite all of it.

This also happens to implement gatherpipe optimizations alongside constant address optimization.

Overall this commit brings a fairly large speed boost when using fastmem.
Author: Ryan Houdek
Date:   2014-11-20 13:56:50 +00:00
Commit: bfbbddd76f (parent da962a3d2b)
5 changed files with 1065 additions and 520 deletions
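The constant address optimization mentioned above applies when the effective address of a load/store is known at JIT time: the emitter can pick the right path up front instead of always emitting a patchable fastmem routine. A minimal standalone sketch of the store-side decision, using illustrative names rather than Dolphin's actual API:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical model of the three store paths the JIT can pick when the
    // effective address is a compile-time constant (names are illustrative).
    enum class StorePath { GatherPipe, FastmemRAM, SlowmemFallback };

    static StorePath ClassifyStore(uint32_t imm_addr, bool is_ram)
    {
        // Stores into the 0xCC008000 page can be turned into direct writes
        // into the software gather pipe buffer.
        if ((imm_addr & 0xFFFFF000) == 0xCC008000)
            return StorePath::GatherPipe;

        // Known RAM addresses can take the fastmem path with no padding,
        // since they will never need to be backpatched.
        if (is_ram)
            return StorePath::FastmemRAM;

        // Everything else (MMIO and friends) calls the slowmem helpers.
        return StorePath::SlowmemFallback;
    }

    int main()
    {
        std::printf("%d\n", static_cast<int>(ClassifyStore(0xCC008000, false))); // 0: gather pipe
        std::printf("%d\n", static_cast<int>(ClassifyStore(0x80001234, true)));  // 1: fastmem RAM
        std::printf("%d\n", static_cast<int>(ClassifyStore(0xCD000000, false))); // 2: slowmem
        return 0;
    }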

@@ -40,6 +40,7 @@ void JitArm::Init()
 	code_block.m_gpa = &js.gpa;
 	code_block.m_fpa = &js.fpa;
 	analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
+	InitBackpatch();
 }
 
 void JitArm::ClearCache()

@@ -48,6 +48,26 @@ private:
 	ArmFPRCache fpr;
 
 	PPCAnalyst::CodeBuffer code_buffer;
 
+	struct BackPatchInfo
+	{
+		enum
+		{
+			FLAG_STORE    = (1 << 0),
+			FLAG_LOAD     = (1 << 1),
+			FLAG_SIZE_8   = (1 << 2),
+			FLAG_SIZE_16  = (1 << 3),
+			FLAG_SIZE_32  = (1 << 4),
+			FLAG_SIZE_F32 = (1 << 5),
+			FLAG_SIZE_F64 = (1 << 6),
+			FLAG_REVERSE  = (1 << 7),
+		};
+
+		u32 m_fastmem_size;
+		u32 m_fastmem_trouble_inst_offset;
+		u32 m_slowmem_size;
+	};
+
+	// The key is the flags
+	std::map<u32, BackPatchInfo> m_backpatch_info;
+
 	void DoDownCount();
@@ -57,11 +77,19 @@ private:
 	ArmGen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
 
-	bool BackPatch(SContext* ctx);
-
 	void BeginTimeProfile(JitBlock* b);
 	void EndTimeProfile(JitBlock* b);
 
+	bool BackPatch(SContext* ctx);
+	bool DisasmLoadStore(const u8* ptr, u32* flags, ArmGen::ARMReg* rD, ArmGen::ARMReg* V1);
+
+	// Initializes the information that backpatching needs
+	// This is required so we know the backpatch routine sizes and trouble offsets
+	void InitBackpatch();
+
+	// Returns the trouble instruction offset
+	// Zero if it isn't a fastmem routine
+	u32 EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, bool do_padding, ArmGen::ARMReg RS, ArmGen::ARMReg V1 = ArmGen::ARMReg::INVALID_REG);
+
 public:
 	JitArm() : code_buffer(32000) {}
 	~JitArm() {}
@@ -118,13 +146,8 @@ public:
 	void GetCarryAndClear(ArmGen::ARMReg reg);
 	void FinalizeCarry(ArmGen::ARMReg reg);
 
-	// TODO: This shouldn't be here
-	void UnsafeStoreFromReg(ArmGen::ARMReg dest, ArmGen::ARMReg value, int accessSize, s32 offset);
-	void SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 offsetReg, int accessSize, s32 offset);
-	void UnsafeLoadToReg(ArmGen::ARMReg dest, ArmGen::ARMReg addr, int accessSize, s32 offsetReg, s32 offset);
-	void SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse);
+	void SafeStoreFromReg(s32 dest, u32 value, s32 offsetReg, int accessSize, s32 offset);
+	void SafeLoadToReg(ArmGen::ARMReg dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse, bool update);
 
 	// OPCODES
 	void unknown_instruction(UGeckoInstruction _inst);
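The BackPatchInfo struct above is stored in a map keyed by the flag bits, so every distinct load/store shape (direction, width, byteswap) carries its own measured routine sizes and trouble-instruction offset. A standalone sketch of the keying scheme; the flag values mirror the header, the sizes are made up for illustration:

    #include <cstdint>
    #include <map>

    enum : uint32_t
    {
        FLAG_STORE   = 1 << 0,
        FLAG_LOAD    = 1 << 1,
        FLAG_SIZE_8  = 1 << 2,
        FLAG_SIZE_16 = 1 << 3,
        FLAG_SIZE_32 = 1 << 4,
        FLAG_REVERSE = 1 << 7,
    };

    struct BackPatchInfo
    {
        uint32_t m_fastmem_size;                // routine length, in instructions
        uint32_t m_fastmem_trouble_inst_offset; // offset of the faulting load/store
        uint32_t m_slowmem_size;                // slow routine length, in instructions
    };

    int main()
    {
        std::map<uint32_t, BackPatchInfo> backpatch_info;
        backpatch_info[FLAG_LOAD | FLAG_SIZE_32] = { 6, 4, 9 }; // invented numbers

        // A byte-reversed 32bit load is a different key than a normal one,
        // so it gets its own routine sizes.
        uint32_t key = FLAG_LOAD | FLAG_SIZE_32 | FLAG_REVERSE;
        return backpatch_info.count(key) ? 0 : 1; // 1: not registered in this sketch
    }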

@@ -16,47 +16,65 @@ using namespace ArmGen;
 // 1) It's really necessary. We don't know anything about the context.
 // 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be
 // that many of them in a typical program/game.
-static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Store, bool *new_system)
+bool JitArm::DisasmLoadStore(const u8* ptr, u32* flags, ARMReg* rD, ARMReg* V1)
 {
+	u32 inst = *(u32*)ptr;
+	u32 prev_inst = *(u32*)(ptr - 4);
+	u32 next_inst = *(u32*)(ptr + 4);
+
 	u8 op = (inst >> 20) & 0xFF;
-	rD = (ARMReg)((inst >> 12) & 0xF);
+	*rD = (ARMReg)((inst >> 12) & 0xF);
 
 	switch (op)
 	{
 		case 0x58: // STR
 		{
-			Store = true;
-			accessSize = 32;
+			*flags |=
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_32;
+			*rD = (ARMReg)(prev_inst & 0xF);
 		}
 		break;
 		case 0x59: // LDR
 		{
-			Store = false;
-			accessSize = 32;
+			*flags |=
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_32;
+			// REV
+			if ((next_inst & 0x0FFF0FF0) != 0x06BF0F30)
+				*flags |= BackPatchInfo::FLAG_REVERSE;
 		}
 		break;
 		case 0x1D: // LDRH
 		{
-			Store = false;
-			accessSize = 16;
+			*flags |=
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_16;
+			// REV16
+			if ((next_inst & 0x0FFF0FF0) != 0x06BF0FB0)
+				*flags |= BackPatchInfo::FLAG_REVERSE;
 		}
 		break;
 		case 0x45 + 0x18: // LDRB
 		{
-			Store = false;
-			accessSize = 8;
+			*flags |=
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_8;
 		}
 		break;
 		case 0x5C: // STRB
 		{
-			Store = true;
-			accessSize = 8;
+			*flags |=
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_8;
+			*rD = (ARMReg)((inst >> 12) & 0xF);
 		}
 		break;
 		case 0x1C: // STRH
 		{
-			Store = true;
-			accessSize = 16;
+			*flags |=
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_16;
+			*rD = (ARMReg)(prev_inst & 0xF);
 		}
 		break;
 		default:
@@ -66,10 +84,92 @@ static bool DisamLoadStore(const u32 inst, ARMReg &rD, u8 &accessSize, bool &Sto
 	switch (op2)
 	{
 		case 0xD: // VLDR/VSTR
-			*new_system = true;
+		{
+			bool load = (inst >> 20) & 1;
+			bool single = !((inst >> 8) & 1);
+
+			if (load)
+				*flags |= BackPatchInfo::FLAG_LOAD;
+			else
+				*flags |= BackPatchInfo::FLAG_STORE;
+
+			if (single)
+				*flags |= BackPatchInfo::FLAG_SIZE_F32;
+			else
+				*flags |= BackPatchInfo::FLAG_SIZE_F64;
+
+			if (single)
+			{
+				if (!load)
+				{
+					u32 vcvt = *(u32*)(ptr - 8);
+					u32 src_register = vcvt & 0xF;
+					src_register |= (vcvt >> 1) & 0x10;
+					*rD = (ARMReg)(src_register + D0);
+				}
+			}
+		}
 		break;
 		case 0x4: // VST1/VLD1
-			*new_system = true;
+		{
+			u32 size = (inst >> 6) & 0x3;
+			bool load = (inst >> 21) & 1;
+
+			if (load)
+				*flags |= BackPatchInfo::FLAG_LOAD;
+			else
+				*flags |= BackPatchInfo::FLAG_STORE;
+
+			if (size == 2) // 32bit
+			{
+				if (load)
+				{
+					// For 32bit loads we are loading to a temporary
+					// So we need to read PC+8,PC+12 to get the two destination registers
+					u32 vcvt_1 = *(u32*)(ptr + 8);
+					u32 vcvt_2 = *(u32*)(ptr + 12);
+
+					u32 dest_register_1 = (vcvt_1 >> 12) & 0xF;
+					dest_register_1 |= (vcvt_1 >> 18) & 0x10;
+
+					u32 dest_register_2 = (vcvt_2 >> 12) & 0xF;
+					dest_register_2 |= (vcvt_2 >> 18) & 0x10;
+
+					// Make sure to encode the destination register to something our emitter understands
+					*rD = (ARMReg)(dest_register_1 + D0);
+					*V1 = (ARMReg)(dest_register_2 + D0);
+				}
+				else
+				{
+					// For 32bit stores we are storing from a temporary
+					// So we need to check the VCVT at PC-8 for the source register
+					u32 vcvt = *(u32*)(ptr - 8);
+					u32 src_register = vcvt & 0xF;
+					src_register |= (vcvt >> 1) & 0x10;
+					*rD = (ARMReg)(src_register + D0);
+				}
+				*flags |= BackPatchInfo::FLAG_SIZE_F32;
+			}
+			else if (size == 3) // 64bit
+			{
+				if (load)
+				{
+					// For 64bit loads we load directly in to the VFP register
+					u32 dest_register = (inst >> 12) & 0xF;
+					dest_register |= (inst >> 18) & 0x10;
+					// Make sure to encode the destination register to something our emitter understands
+					*rD = (ARMReg)(dest_register + D0);
+				}
+				else
+				{
+					// For 64bit stores we are storing from a temporary
+					// Check the previous VREV64 instruction for the real register
+					u32 src_register = prev_inst & 0xF;
+					src_register |= (prev_inst >> 1) & 0x10;
+					*rD = (ARMReg)(src_register + D0);
+				}
+				*flags |= BackPatchInfo::FLAG_SIZE_F64;
+			}
+		}
 		break;
 		default:
 			printf("Op is 0x%02x\n", op);
@@ -95,94 +195,484 @@ bool JitArm::BackPatch(SContext* ctx)
 	// We need to get the destination register before we start
 	u8* codePtr = (u8*)ctx->CTX_PC;
 	u32 Value = *(u32*)codePtr;
-	ARMReg rD;
-	u8 accessSize;
-	bool Store;
-	bool new_system = false;
+	ARMReg rD = INVALID_REG;
+	ARMReg V1 = INVALID_REG;
+	u32 flags = 0;
 
-	if (!DisamLoadStore(Value, rD, accessSize, Store, &new_system))
+	if (!DisasmLoadStore(codePtr, &flags, &rD, &V1))
 	{
 		printf("Invalid backpatch at location 0x%08lx(0x%08x)\n", ctx->CTX_PC, Value);
 		exit(0);
 	}
 
-	if (new_system)
-	{
-		// The new system is a lot easier to backpatch than the old crap.
-		// Instead of backpatching over code and making sure we NOP pad and other crap
-		// We emit both the slow and fast path and branch over the slow path each time
-		// We search backwards until we find the second branch instruction
-		// Then proceed to replace it with a NOP and set that to the new PC.
-		// This ensures that we run the slow path and then branch over the fast path.
-
-		// Run backwards until we find the branch we want to NOP
-		for (int branches = 2; branches > 0; ctx->CTX_PC -= 4)
-			if ((*(u32*)ctx->CTX_PC & 0x0F000000) == 0x0A000000) // B
-				--branches;
-		ctx->CTX_PC += 4;
-		ARMXEmitter emitter((u8*)ctx->CTX_PC);
-		emitter.NOP(1);
-		emitter.FlushIcache();
-		return true;
-	}
-	else
-	{
-		if (Store)
-		{
-			const u32 ARMREGOFFSET = 4 * 5;
-			ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-			switch (accessSize)
-			{
-				case 8: // 8bit
-					emitter.MOVI2R(R14, (u32)&Memory::Write_U8, false); // 1-2
-					return 0;
-				break;
-				case 16: // 16bit
-					emitter.MOVI2R(R14, (u32)&Memory::Write_U16, false); // 1-2
-					return 0;
-				break;
-				case 32: // 32bit
-					emitter.MOVI2R(R14, (u32)&Memory::Write_U32, false); // 1-2
-				break;
-			}
-			emitter.PUSH(4, R0, R1, R2, R3); // 3
-			emitter.MOV(R0, rD); // Value - 4
-			emitter.MOV(R1, R10); // Addr- 5
-			emitter.BL(R14); // 6
-			emitter.POP(4, R0, R1, R2, R3); // 7
-			u32 newPC = ctx->CTX_PC - (ARMREGOFFSET + 4 * 4);
-			ctx->CTX_PC = newPC;
-			emitter.FlushIcache();
-			return true;
-		}
-		else
-		{
-			const u32 ARMREGOFFSET = 4 * 4;
-			ARMXEmitter emitter(codePtr - ARMREGOFFSET);
-			switch (accessSize)
-			{
-				case 8: // 8bit
-					emitter.MOVI2R(R14, (u32)&Memory::Read_U8, false); // 2
-				break;
-				case 16: // 16bit
-					emitter.MOVI2R(R14, (u32)&Memory::Read_U16, false); // 2
-				break;
-				case 32: // 32bit
-					emitter.MOVI2R(R14, (u32)&Memory::Read_U32, false); // 2
-				break;
-			}
-			emitter.PUSH(4, R0, R1, R2, R3); // 3
-			emitter.MOV(R0, R10); // 4
-			emitter.BL(R14); // 5
-			emitter.MOV(R14, R0); // 6
-			emitter.POP(4, R0, R1, R2, R3); // 7
-			emitter.MOV(rD, R14); // 8
-			ctx->CTX_PC -= ARMREGOFFSET + (4 * 4);
-			emitter.FlushIcache();
-			return true;
-		}
-	}
-	return 0;
+	BackPatchInfo& info = m_backpatch_info[flags];
+	ARMXEmitter emitter(codePtr - info.m_fastmem_trouble_inst_offset * 4);
+	u32 new_pc = (u32)emitter.GetCodePtr();
+	EmitBackpatchRoutine(&emitter, flags, false, true, rD, V1);
+	emitter.FlushIcache();
+	ctx->CTX_PC = new_pc;
+	return true;
 }
+
+u32 JitArm::EmitBackpatchRoutine(ARMXEmitter* emit, u32 flags, bool fastmem, bool do_padding, ARMReg RS, ARMReg V1)
+{
+	ARMReg addr = R12;
+	ARMReg temp = R11;
+	u32 trouble_offset = 0;
+	const u8* code_base = emit->GetCodePtr();
+
+	if (fastmem)
+	{
+		ARMReg temp2 = R10;
+		Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
+		emit->BIC(temp, addr, mask); // 1
+		emit->MOVI2R(temp2, (u32)Memory::base); // 2-3
+		emit->ADD(temp, temp, temp2); // 4
+
+		if (flags & BackPatchInfo::FLAG_STORE &&
+		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		{
+			NEONXEmitter nemit(emit);
+			if (flags & BackPatchInfo::FLAG_SIZE_F32)
+			{
+				emit->VCVT(S0, RS, 0);
+				nemit.VREV32(I_8, D0, D0);
+				trouble_offset = (emit->GetCodePtr() - code_base) / 4;
+				emit->VSTR(S0, temp, 0);
+			}
+			else
+			{
+				nemit.VREV64(I_8, D0, RS);
+				trouble_offset = (emit->GetCodePtr() - code_base) / 4;
+				nemit.VST1(I_64, D0, temp);
+			}
+		}
+		else if (flags & BackPatchInfo::FLAG_LOAD &&
+		         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		{
+			NEONXEmitter nemit(emit);
+			trouble_offset = (emit->GetCodePtr() - code_base) / 4;
+			if (flags & BackPatchInfo::FLAG_SIZE_F32)
+			{
+				nemit.VLD1(F_32, D0, temp);
+				nemit.VREV32(I_8, D0, D0); // Byte swap to result
+				emit->VCVT(RS, S0, 0);
+				emit->VCVT(V1, S0, 0);
+			}
+			else
+			{
+				nemit.VLD1(I_64, RS, temp);
+				nemit.VREV64(I_8, RS, RS); // Byte swap to result
+			}
+		}
+		else if (flags & BackPatchInfo::FLAG_STORE)
+		{
+			if (flags & BackPatchInfo::FLAG_SIZE_32)
+				emit->REV(temp2, RS);
+			else if (flags & BackPatchInfo::FLAG_SIZE_16)
+				emit->REV16(temp2, RS);
+
+			trouble_offset = (emit->GetCodePtr() - code_base) / 4;
+
+			if (flags & BackPatchInfo::FLAG_SIZE_32)
+				emit->STR(temp2, temp);
+			else if (flags & BackPatchInfo::FLAG_SIZE_16)
+				emit->STRH(temp2, temp);
+			else
+				emit->STRB(RS, temp);
+		}
+		else
+		{
+			trouble_offset = (emit->GetCodePtr() - code_base) / 4;
+
+			if (flags & BackPatchInfo::FLAG_SIZE_32)
+				emit->LDR(RS, temp); // 5
+			else if (flags & BackPatchInfo::FLAG_SIZE_16)
+				emit->LDRH(RS, temp);
+			else if (flags & BackPatchInfo::FLAG_SIZE_8)
+				emit->LDRB(RS, temp);
+
+			if (!(flags & BackPatchInfo::FLAG_REVERSE))
+			{
+				if (flags & BackPatchInfo::FLAG_SIZE_32)
+					emit->REV(RS, RS); // 6
+				else if (flags & BackPatchInfo::FLAG_SIZE_16)
+					emit->REV16(RS, RS);
+			}
+		}
+	}
+	else
+	{
+		if (flags & BackPatchInfo::FLAG_STORE &&
+		    flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		{
+			emit->PUSH(4, R0, R1, R2, R3);
+			if (flags & BackPatchInfo::FLAG_SIZE_F32)
+			{
+				emit->MOV(R1, addr);
+				emit->VCVT(S0, RS, 0);
+				emit->VMOV(R0, S0);
+				emit->MOVI2R(temp, (u32)&Memory::Write_U32);
+				emit->BL(temp);
+			}
+			else
+			{
+				emit->MOVI2R(temp, (u32)&Memory::Write_F64);
+#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1
+				emit->VMOV(R0, RS);
+				emit->MOV(R2, addr);
+#else
+				emit->VMOV(D0, RS);
+				emit->MOV(R0, addr);
+#endif
+				emit->BL(temp);
+			}
+			emit->POP(4, R0, R1, R2, R3);
+		}
+		else if (flags & BackPatchInfo::FLAG_LOAD &&
+		         flags & (BackPatchInfo::FLAG_SIZE_F32 | BackPatchInfo::FLAG_SIZE_F64))
+		{
+			emit->PUSH(4, R0, R1, R2, R3);
+			emit->MOV(R0, addr);
+			if (flags & BackPatchInfo::FLAG_SIZE_F32)
+			{
+				emit->MOVI2R(temp, (u32)&Memory::Read_U32);
+				emit->BL(temp);
+				emit->VMOV(S0, R0);
+				emit->VCVT(RS, S0, 0);
+				emit->VCVT(V1, S0, 0);
+			}
+			else
+			{
+				emit->MOVI2R(temp, (u32)&Memory::Read_F64);
+				emit->BL(temp);
+#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1
+				emit->VMOV(RS, R0);
+#else
+				emit->VMOV(RS, D0);
+#endif
+			}
+			emit->POP(4, R0, R1, R2, R3);
+		}
+		else if (flags & BackPatchInfo::FLAG_STORE)
+		{
+			emit->PUSH(4, R0, R1, R2, R3);
+			emit->MOV(R0, RS);
+			emit->MOV(R1, addr);
+
+			if (flags & BackPatchInfo::FLAG_SIZE_32)
+				emit->MOVI2R(temp, (u32)&Memory::Write_U32);
+			else if (flags & BackPatchInfo::FLAG_SIZE_16)
+				emit->MOVI2R(temp, (u32)&Memory::Write_U16);
+			else
+				emit->MOVI2R(temp, (u32)&Memory::Write_U8);
+
+			emit->BL(temp);
+			emit->POP(4, R0, R1, R2, R3);
+		}
+		else
+		{
+			emit->PUSH(4, R0, R1, R2, R3);
+			emit->MOV(R0, addr);
+
+			if (flags & BackPatchInfo::FLAG_SIZE_32)
+				emit->MOVI2R(temp, (u32)&Memory::Read_U32);
+			else if (flags & BackPatchInfo::FLAG_SIZE_16)
+				emit->MOVI2R(temp, (u32)&Memory::Read_U16);
+			else if (flags & BackPatchInfo::FLAG_SIZE_8)
+				emit->MOVI2R(temp, (u32)&Memory::Read_U8);
+
+			emit->BL(temp);
+			emit->MOV(temp, R0);
+			emit->POP(4, R0, R1, R2, R3);
+
+			if (!(flags & BackPatchInfo::FLAG_REVERSE))
+			{
+				emit->MOV(RS, temp);
+			}
+			else
+			{
+				if (flags & BackPatchInfo::FLAG_SIZE_32)
+					emit->REV(RS, temp); // 6
+				else if (flags & BackPatchInfo::FLAG_SIZE_16)
+					emit->REV16(RS, temp);
+			}
+		}
+	}
+
+	if (do_padding)
+	{
+		BackPatchInfo& info = m_backpatch_info[flags];
+		u32 num_insts_max = std::max(info.m_fastmem_size, info.m_slowmem_size);
+
+		u32 code_size = emit->GetCodePtr() - code_base;
+		code_size /= 4;
+
+		emit->NOP(num_insts_max - code_size);
+	}
+
+	return trouble_offset;
+}
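The do_padding block above is what makes in-place backpatching possible: both the fastmem and slowmem variants of a routine are NOP-filled up to the size of the larger one, so the backpatcher can later overwrite one with the other without shifting any surrounding code. A toy version of the arithmetic, assuming sizes measured in 4-byte ARM instructions:

    #include <algorithm>
    #include <cstdint>

    // Both routine variants get padded to max(fastmem, slowmem) instructions.
    static uint32_t NopsNeeded(uint32_t emitted_insts, uint32_t fastmem_size, uint32_t slowmem_size)
    {
        uint32_t num_insts_max = std::max(fastmem_size, slowmem_size);
        return num_insts_max - emitted_insts; // trailing NOPs to append
    }

    int main()
    {
        // If the fastmem variant is 6 instructions and the slowmem one is 9,
        // the fastmem emission gets 3 trailing NOPs and the slowmem one gets 0.
        return NopsNeeded(6, 6, 9) == 3 ? 0 : 1;
    }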
+
+void JitArm::InitBackpatch()
+{
+	u32 flags = 0;
+	BackPatchInfo info;
+	u8* code_base = GetWritableCodePtr();
+	u8* code_end;
+
+	// Writes
+	{
+		// 8bit
+		{
+			flags =
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_8;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 16bit
+		{
+			flags =
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_16;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 32bit
+		{
+			flags =
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_32;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 32bit float
+		{
+			flags =
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_F32;
+			EmitBackpatchRoutine(this, flags, false, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 64bit float
+		{
+			flags =
+				BackPatchInfo::FLAG_STORE |
+				BackPatchInfo::FLAG_SIZE_F64;
+			EmitBackpatchRoutine(this, flags, false, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+	}
+
+	// Loads
+	{
+		// 8bit
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_8;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 16bit
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_16;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 32bit
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_32;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 16bit - reverse
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_16 |
+				BackPatchInfo::FLAG_REVERSE;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 32bit - reverse
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_32 |
+				BackPatchInfo::FLAG_REVERSE;
+			EmitBackpatchRoutine(this, flags, false, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, R0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 32bit float
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_F32;
+			EmitBackpatchRoutine(this, flags, false, false, D0, D1);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, D0, D1);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+		// 64bit float
+		{
+			flags =
+				BackPatchInfo::FLAG_LOAD |
+				BackPatchInfo::FLAG_SIZE_F64;
+			EmitBackpatchRoutine(this, flags, false, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_slowmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			info.m_fastmem_trouble_inst_offset =
+				EmitBackpatchRoutine(this, flags, true, false, D0);
+			code_end = GetWritableCodePtr();
+			info.m_fastmem_size = (code_end - code_base) / 4;
+			SetCodePtr(code_base);
+
+			m_backpatch_info[flags] = info;
+		}
+	}
+}
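InitBackpatch above emits every routine variant once into scratch space purely to measure it: how long each variant is, and how many instructions into the fastmem variant the faulting load/store sits. That last number is what BackPatch uses to walk from the fault PC back to the start of the routine before re-emitting it as slowmem. A standalone model of that PC math:

    #include <cstdint>

    // The signal context gives the PC of the faulting load/store;
    // m_fastmem_trouble_inst_offset says how many instructions into the
    // routine that load/store is, so subtracting recovers the routine start.
    static uintptr_t RoutineStart(uintptr_t fault_pc, uint32_t trouble_inst_offset)
    {
        return fault_pc - static_cast<uintptr_t>(trouble_inst_offset) * 4; // 4 bytes per instruction
    }

    int main()
    {
        // Fault at 0x1008 with the trouble instruction third in the routine
        // (offset 2) means the routine starts at 0x1000.
        return RoutineStart(0x1008, 2) == 0x1000 ? 0 : 1;
    }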

@@ -18,114 +18,149 @@
 using namespace ArmGen;
 
-void JitArm::UnsafeStoreFromReg(ARMReg dest, ARMReg value, int accessSize, s32 offset)
-{
-	// All this gets replaced on backpatch
-	Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-	BIC(dest, dest, mask); // 1
-	MOVI2R(R14, (u32)Memory::base, false); // 2-3
-	ADD(dest, dest, R14); // 4
-	switch (accessSize)
-	{
-		case 32:
-			REV(value, value); // 5
-		break;
-		case 16:
-			REV16(value, value);
-		break;
-		case 8:
-			NOP(1);
-		break;
-	}
-	switch (accessSize)
-	{
-		case 32:
-			STR(value, dest); // 6
-		break;
-		case 16:
-			STRH(value, dest);
-		break;
-		case 8:
-			STRB(value, dest);
-		break;
-	}
-	NOP(1); // 7
-}
-
-void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset)
-{
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && fastmem)
-	{
-		ARMReg RA;
-		ARMReg RB;
-		ARMReg RS = gpr.R(value);
-
-		if (dest != -1)
-			RA = gpr.R(dest);
-
-		if (regOffset != -1)
-		{
-			RB = gpr.R(regOffset);
-			MOV(R10, RB);
-			NOP(1);
-		}
-		else
-		{
-			MOVI2R(R10, (u32)offset, false);
-		}
-
-		if (dest != -1)
-			ADD(R10, R10, RA);
-		else
-			NOP(1);
-
-		MOV(R12, RS);
-		UnsafeStoreFromReg(R10, R12, accessSize, 0);
-		return;
-	}
-
-	ARMReg rA = gpr.GetReg();
-	ARMReg rB = gpr.GetReg();
-	ARMReg rC = gpr.GetReg();
-	ARMReg RA = INVALID_REG;
-	ARMReg RB = INVALID_REG;
-	if (dest != -1)
-		RA = gpr.R(dest);
-	if (regOffset != -1)
-		RB = gpr.R(regOffset);
-	ARMReg RS = gpr.R(value);
-	switch (accessSize)
-	{
-		case 32:
-			MOVI2R(rA, (u32)&Memory::Write_U32);
-		break;
-		case 16:
-			MOVI2R(rA, (u32)&Memory::Write_U16);
-		break;
-		case 8:
-			MOVI2R(rA, (u32)&Memory::Write_U8);
-		break;
-	}
-	MOV(rB, RS);
-	if (regOffset == -1)
-	{
-		MOVI2R(rC, offset);
-		if (dest != -1)
-			ADD(rC, rC, RA);
-	}
-	else
-	{
-		if (dest != -1)
-			ADD(rC, RA, RB);
-		else
-			MOV(rC, RB);
-	}
-	PUSH(4, R0, R1, R2, R3);
-	MOV(R0, rB);
-	MOV(R1, rC);
-	BL(rA);
-	POP(4, R0, R1, R2, R3);
-	gpr.Unlock(rA, rB, rC);
-}
+void JitArm::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset)
+{
+	// We want to make sure to not get LR as a temp register
+	ARMReg rA = R12;
+
+	u32 imm_addr = 0;
+	bool is_immediate = false;
+
+	if (regOffset == -1)
+	{
+		if (dest != -1)
+		{
+			if (gpr.IsImm(dest))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(dest) + offset;
+			}
+			else
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(rA, gpr.R(dest), off);
+				}
+				else
+				{
+					MOVI2R(rA, offset);
+					ADD(rA, rA, gpr.R(dest));
+				}
+			}
+		}
+		else
+		{
+			is_immediate = true;
+			imm_addr = offset;
+		}
+	}
+	else
+	{
+		if (dest != -1)
+		{
+			if (gpr.IsImm(dest) && gpr.IsImm(regOffset))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(dest) + gpr.GetImm(regOffset);
+			}
+			else if (gpr.IsImm(dest) && !gpr.IsImm(regOffset))
+			{
+				Operand2 off;
+				if (TryMakeOperand2(gpr.GetImm(dest), off))
+				{
+					ADD(rA, gpr.R(regOffset), off);
+				}
+				else
+				{
+					MOVI2R(rA, gpr.GetImm(dest));
+					ADD(rA, rA, gpr.R(regOffset));
+				}
+			}
+			else if (!gpr.IsImm(dest) && gpr.IsImm(regOffset))
+			{
+				Operand2 off;
+				if (TryMakeOperand2(gpr.GetImm(regOffset), off))
+				{
+					ADD(rA, gpr.R(dest), off);
+				}
+				else
+				{
+					MOVI2R(rA, gpr.GetImm(regOffset));
+					ADD(rA, rA, gpr.R(dest));
+				}
+			}
+			else
+			{
+				ADD(rA, gpr.R(dest), gpr.R(regOffset));
+			}
+		}
+		else
+		{
+			if (gpr.IsImm(regOffset))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(regOffset);
+			}
+			else
+			{
+				MOV(rA, gpr.R(regOffset));
+			}
+		}
+	}
+
+	ARMReg RS = gpr.R(value);
+
+	u32 flags = BackPatchInfo::FLAG_STORE;
+	if (accessSize == 32)
+		flags |= BackPatchInfo::FLAG_SIZE_32;
+	else if (accessSize == 16)
+		flags |= BackPatchInfo::FLAG_SIZE_16;
+	else
+		flags |= BackPatchInfo::FLAG_SIZE_8;
+
+	if (is_immediate)
+	{
+		if ((imm_addr & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe)
+		{
+			MOVI2R(R14, (u32)&GPFifo::m_gatherPipeCount);
+			MOVI2R(R10, (u32)GPFifo::m_gatherPipe);
+			LDR(R11, R14);
+			if (accessSize == 32)
+			{
+				REV(RS, RS);
+				STR(RS, R10, R11);
+				REV(RS, RS);
+			}
+			else if (accessSize == 16)
+			{
+				REV16(RS, RS);
+				STRH(RS, R10, R11);
+				REV16(RS, RS);
+			}
+			else
+			{
+				STRB(RS, R10, R11);
+			}
+			ADD(R11, R11, accessSize >> 3);
+			STR(R11, R14);
+			jit->js.fifoBytesThisBlock += accessSize >> 3;
+		}
+		else if (Memory::IsRAMAddress(imm_addr))
+		{
+			MOVI2R(rA, imm_addr);
+			EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, false, RS);
+		}
+		else
+		{
+			MOVI2R(rA, imm_addr);
+			EmitBackpatchRoutine(this, flags, false, false, RS);
+		}
+	}
+	else
+	{
+		EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, true, RS);
+	}
+}
 
 void JitArm::stX(UGeckoInstruction inst)
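The gather pipe path in SafeStoreFromReg above fires when the store address is an immediate in the 0xCC008000 page: the value is byteswapped and stored straight into the software FIFO at the current count, and the count plus the block's FIFO byte total are bumped by the access size. A standalone model of a 32bit gather pipe store, with local stand-ins for GPFifo's buffer and count (__builtin_bswap32 assumes GCC/Clang):

    #include <cstdint>
    #include <cstring>

    static uint8_t  gather_pipe[128];
    static uint32_t gather_pipe_count = 0;

    static void GatherPipeStore32(uint32_t value)
    {
        uint32_t swapped = __builtin_bswap32(value); // guest is big-endian
        std::memcpy(gather_pipe + gather_pipe_count, &swapped, sizeof(swapped));
        gather_pipe_count += 4; // accessSize >> 3 for a 32bit store
    }

    int main()
    {
        GatherPipeStore32(0xDEADBEEF);
        return gather_pipe_count == 4 ? 0 : 1;
    }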
@@ -138,7 +173,6 @@ void JitArm::stX(UGeckoInstruction inst)
 	u32 accessSize = 0;
 	s32 regOffset = -1;
 	bool update = false;
-	bool fastmem = false;
 
 	switch (inst.OPCD)
 	{
 		case 45: // sthu
@@ -152,7 +186,6 @@ void JitArm::stX(UGeckoInstruction inst)
 		case 183: // stwux
 			update = true;
 		case 151: // stwx
-			fastmem = true;
 			accessSize = 32;
 			regOffset = b;
 		break;
@@ -173,7 +206,6 @@ void JitArm::stX(UGeckoInstruction inst)
 		case 37: // stwu
 			update = true;
 		case 36: // stw
-			fastmem = true;
 			accessSize = 32;
 		break;
 		case 39: // stbu
@@ -182,7 +214,9 @@ void JitArm::stX(UGeckoInstruction inst)
 			accessSize = 8;
 		break;
 	}
-	SafeStoreFromReg(fastmem, update ? a : (a ? a : -1), s, regOffset, accessSize, offset);
+
+	SafeStoreFromReg(update ? a : (a ? a : -1), s, regOffset, accessSize, offset);
+
 	if (update)
 	{
 		ARMReg rA = gpr.GetReg();
@@ -193,143 +227,135 @@ void JitArm::stX(UGeckoInstruction inst)
 		// Check for DSI exception prior to writing back address
 		LDR(rA, R9, PPCSTATE_OFF(Exceptions));
 		TST(rA, EXCEPTION_DSI);
-		FixupBranch DoNotWrite = B_CC(CC_NEQ);
-		if (a)
-		{
-			if (regOffset == -1)
-			{
-				MOVI2R(rA, offset);
-				ADD(RA, RA, rA);
-			}
-			else
-			{
-				ADD(RA, RA, RB);
-			}
-		}
-		else
-		{
-			if (regOffset == -1)
-				MOVI2R(RA, (u32)offset);
-			else
-				MOV(RA, RB);
-		}
-		SetJumpTarget(DoNotWrite);
+		SetCC(CC_EQ);
+		if (regOffset == -1)
+		{
+			MOVI2R(rA, offset);
+			ADD(RA, RA, rA);
+		}
+		else
+		{
+			ADD(RA, RA, RB);
+		}
+		SetCC();
 		gpr.Unlock(rA);
 	}
 }
 
-void JitArm::UnsafeLoadToReg(ARMReg dest, ARMReg addr, int accessSize, s32 offsetReg, s32 offset)
-{
-	ARMReg rA = gpr.GetReg();
-	if (offsetReg == -1)
-	{
-		MOVI2R(rA, offset, false); // -3
-		ADD(addr, addr, rA); // -1
-	}
-	else
-	{
-		NOP(2); // -3, -2
-		// offsetReg is preloaded here
-		ADD(addr, addr, gpr.R(offsetReg)); // -1
-	}
-
-	// All this gets replaced on backpatch
-	Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-	BIC(addr, addr, mask); // 1
-	MOVI2R(rA, (u32)Memory::base, false); // 2-3
-	ADD(addr, addr, rA); // 4
-	switch (accessSize)
-	{
-		case 32:
-			LDR(dest, addr); // 5
-		break;
-		case 16:
-			LDRH(dest, addr);
-		break;
-		case 8:
-			LDRB(dest, addr);
-		break;
-	}
-	switch (accessSize)
-	{
-		case 32:
-			REV(dest, dest); // 6
-		break;
-		case 16:
-			REV16(dest, dest);
-		break;
-		case 8:
-			NOP(1);
-		break;
-	}
-	NOP(2); // 7-8
-	gpr.Unlock(rA);
-}
-
-void JitArm::SafeLoadToReg(bool fastmem, u32 dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse)
-{
-	ARMReg RD = gpr.R(dest);
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && fastmem)
-	{
-		// Preload for fastmem
-		if (offsetReg != -1)
-			gpr.R(offsetReg);
-
-		if (addr != -1)
-			MOV(R10, gpr.R(addr));
-		else
-			MOV(R10, 0);
-
-		UnsafeLoadToReg(RD, R10, accessSize, offsetReg, offset);
-		return;
-	}
-	ARMReg rA = gpr.GetReg();
-	ARMReg rB = gpr.GetReg();
-
-	if (offsetReg == -1)
-	{
-		MOVI2R(rA, offset);
-		if (addr != -1)
-			ADD(rA, rA, gpr.R(addr));
-	}
-	else
-	{
-		if (addr != -1)
-			ADD(rA, gpr.R(addr), gpr.R(offsetReg));
-		else
-			MOV(rA, gpr.R(offsetReg));
-	}
-	switch (accessSize)
-	{
-		case 8:
-			MOVI2R(rB, (u32)&Memory::Read_U8);
-		break;
-		case 16:
-			MOVI2R(rB, (u32)&Memory::Read_U16);
-		break;
-		case 32:
-			MOVI2R(rB, (u32)&Memory::Read_U32);
-		break;
-	}
-	PUSH(4, R0, R1, R2, R3);
-	MOV(R0, rA);
-	BL(rB);
-	MOV(rA, R0);
-	POP(4, R0, R1, R2, R3);
-	MOV(RD, rA);
-	if (signExtend) // Only on 16 loads
-		SXTH(RD, RD);
-	if (reverse)
-	{
-		if (accessSize == 32)
-			REV(RD, RD);
-		else if (accessSize == 16)
-			REV16(RD, RD);
-	}
-	gpr.Unlock(rA, rB);
-}
+void JitArm::SafeLoadToReg(ARMReg dest, s32 addr, s32 offsetReg, int accessSize, s32 offset, bool signExtend, bool reverse, bool update)
+{
+	// We want to make sure to not get LR as a temp register
+	ARMReg rA = R12;
+
+	u32 imm_addr = 0;
+	bool is_immediate = false;
+
+	if (offsetReg == -1)
+	{
+		if (addr != -1)
+		{
+			if (gpr.IsImm(addr))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(addr) + offset;
+			}
+			else
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(rA, gpr.R(addr), off);
+				}
+				else
+				{
+					MOVI2R(rA, offset);
+					ADD(rA, rA, gpr.R(addr));
+				}
+			}
+		}
+		else
+		{
+			is_immediate = true;
+			imm_addr = offset;
+		}
+	}
+	else
+	{
+		if (addr != -1)
+		{
+			if (gpr.IsImm(addr) && gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(addr) + gpr.GetImm(offsetReg);
+			}
+			else if (gpr.IsImm(addr) && !gpr.IsImm(offsetReg))
+			{
+				Operand2 off;
+				if (TryMakeOperand2(gpr.GetImm(addr), off))
+				{
+					ADD(rA, gpr.R(offsetReg), off);
+				}
+				else
+				{
+					MOVI2R(rA, gpr.GetImm(addr));
+					ADD(rA, rA, gpr.R(offsetReg));
+				}
+			}
+			else if (!gpr.IsImm(addr) && gpr.IsImm(offsetReg))
+			{
+				Operand2 off;
+				if (TryMakeOperand2(gpr.GetImm(offsetReg), off))
+				{
+					ADD(rA, gpr.R(addr), off);
+				}
+				else
+				{
+					MOVI2R(rA, gpr.GetImm(offsetReg));
+					ADD(rA, rA, gpr.R(addr));
+				}
+			}
+			else
+			{
+				ADD(rA, gpr.R(addr), gpr.R(offsetReg));
+			}
+		}
+		else
+		{
+			if (gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(offsetReg);
+			}
+			else
+			{
+				MOV(rA, gpr.R(offsetReg));
+			}
+		}
+	}
+
+	if (is_immediate)
+		MOVI2R(rA, imm_addr);
+
+	u32 flags = BackPatchInfo::FLAG_LOAD;
+	if (accessSize == 32)
+		flags |= BackPatchInfo::FLAG_SIZE_32;
+	else if (accessSize == 16)
+		flags |= BackPatchInfo::FLAG_SIZE_16;
+	else
+		flags |= BackPatchInfo::FLAG_SIZE_8;
+
+	if (reverse)
+		flags |= BackPatchInfo::FLAG_REVERSE;
+
+	EmitBackpatchRoutine(this, flags,
+		SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+		!(is_immediate && Memory::IsRAMAddress(imm_addr)), dest);
+
+	if (signExtend) // Only on 16 loads
+		SXTH(dest, dest);
+
+	if (update)
+		MOV(gpr.R(addr), rA);
+}
 
 void JitArm::lXX(UGeckoInstruction inst)
@@ -344,7 +370,6 @@ void JitArm::lXX(UGeckoInstruction inst)
 	bool update = false;
 	bool signExtend = false;
 	bool reverse = false;
-	bool fastmem = false;
 
 	switch (inst.OPCD)
 	{
@@ -354,21 +379,18 @@ void JitArm::lXX(UGeckoInstruction inst)
 		case 55: // lwzux
 			update = true;
 		case 23: // lwzx
-			fastmem = true;
 			accessSize = 32;
 			offsetReg = b;
 		break;
 		case 119: //lbzux
 			update = true;
 		case 87: // lbzx
-			fastmem = true;
 			accessSize = 8;
 			offsetReg = b;
 		break;
 		case 311: // lhzux
 			update = true;
 		case 279: // lhzx
-			fastmem = true;
 			accessSize = 16;
 			offsetReg = b;
 		break;
@@ -392,19 +414,16 @@ void JitArm::lXX(UGeckoInstruction inst)
 		case 33: // lwzu
 			update = true;
 		case 32: // lwz
-			fastmem = true;
 			accessSize = 32;
 		break;
 		case 35: // lbzu
 			update = true;
 		case 34: // lbz
-			fastmem = true;
 			accessSize = 8;
 		break;
 		case 41: // lhzu
 			update = true;
 		case 40: // lhz
-			fastmem = true;
 			accessSize = 16;
 		break;
 		case 43: // lhau
@@ -417,27 +436,13 @@ void JitArm::lXX(UGeckoInstruction inst)
 	// Check for exception before loading
 	ARMReg rA = gpr.GetReg(false);
+	ARMReg RD = gpr.R(d);
 
 	LDR(rA, R9, PPCSTATE_OFF(Exceptions));
 	TST(rA, EXCEPTION_DSI);
 	FixupBranch DoNotLoad = B_CC(CC_NEQ);
 
-	SafeLoadToReg(fastmem, d, update ? a : (a ? a : -1), offsetReg, accessSize, offset, signExtend, reverse);
-
-	if (update)
-	{
-		ARMReg RA = gpr.R(a);
-		if (offsetReg == -1)
-		{
-			rA = gpr.GetReg(false);
-			MOVI2R(rA, offset);
-			ADD(RA, RA, rA);
-		}
-		else
-		{
-			ADD(RA, RA, gpr.R(offsetReg));
-		}
-	}
+	SafeLoadToReg(RD, update ? a : (a ? a : -1), offsetReg, accessSize, offset, signExtend, reverse, update);
 
 	SetJumpTarget(DoNotLoad);
@@ -449,8 +454,6 @@ void JitArm::lXX(UGeckoInstruction inst)
 	    (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
 	    Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
 	{
-		ARMReg RD = gpr.R(d);
-
 		// if it's still 0, we can wait until the next event
 		TST(RD, RD);
 		FixupBranch noIdle = B_CC(CC_NEQ);
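Note how the update forms above no longer branch around the address writeback: the check of PowerPC::Exceptions leaves the flags set, and the writeback is predicated with SetCC so RA is only updated when no DSI is pending. A standalone model of that guard; the EXCEPTION_DSI bit value here is illustrative, not Dolphin's actual constant:

    #include <cstdint>

    static const uint32_t EXCEPTION_DSI = 1 << 2; // illustrative bit position

    // Mirrors the SetCC(CC_EQ)/SetCC() guarded writeback: rA only receives the
    // new effective address if the access did not raise a DSI exception.
    static void UpdateRA(uint32_t& ra, uint32_t offset, uint32_t exceptions)
    {
        if (!(exceptions & EXCEPTION_DSI)) // the CC_EQ path
            ra += offset;
    }

    int main()
    {
        uint32_t ra = 0x80000000;
        UpdateRA(ra, 8, 0);             // no exception: writeback happens
        UpdateRA(ra, 8, EXCEPTION_DSI); // DSI pending: skipped
        return ra == 0x80000008 ? 0 : 1;
    }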

@@ -24,16 +24,13 @@ void JitArm::lfXX(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
 
-	ARMReg rA = gpr.GetReg();
-	ARMReg rB = gpr.GetReg();
 	ARMReg RA;
 
 	u32 a = inst.RA, b = inst.RB;
 
 	s32 offset = inst.SIMM_16;
-	bool single = false;
+	u32 flags = BackPatchInfo::FLAG_LOAD;
 	bool update = false;
-	bool zeroA = false;
 	s32 offsetReg = -1;
 
 	switch (inst.OPCD)
@@ -42,157 +39,152 @@ void JitArm::lfXX(UGeckoInstruction inst)
 		switch (inst.SUBOP10)
 		{
 			case 567: // lfsux
-				single = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F32;
 				update = true;
 				offsetReg = b;
 			break;
 			case 535: // lfsx
-				single = true;
-				zeroA = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F32;
 				offsetReg = b;
 			break;
 			case 631: // lfdux
+				flags |= BackPatchInfo::FLAG_SIZE_F64;
 				update = true;
 				offsetReg = b;
 			break;
 			case 599: // lfdx
-				zeroA = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F64;
 				offsetReg = b;
 			break;
 		}
 	break;
 	case 49: // lfsu
+		flags |= BackPatchInfo::FLAG_SIZE_F32;
 		update = true;
-		single = true;
 	break;
 	case 48: // lfs
-		single = true;
-		zeroA = true;
+		flags |= BackPatchInfo::FLAG_SIZE_F32;
 	break;
 	case 51: // lfdu
+		flags |= BackPatchInfo::FLAG_SIZE_F64;
 		update = true;
 	break;
 	case 50: // lfd
-		zeroA = true;
+		flags |= BackPatchInfo::FLAG_SIZE_F64;
 	break;
 	}
 
-	ARMReg v0 = fpr.R0(inst.FD, false), v1;
-	if (single)
-		v1 = fpr.R1(inst.FD, false);
+	ARMReg v0 = fpr.R0(inst.FD, false), v1 = INVALID_REG;
+	if (flags & BackPatchInfo::FLAG_SIZE_F32)
+		v1 = fpr.R1(inst.FD, false);
+
+	ARMReg rA = R11;
+	ARMReg addr = R12;
+
+	u32 imm_addr = 0;
+	bool is_immediate = false;
 
 	if (update)
 	{
-		RA = gpr.R(a);
-		// Update path /always/ uses RA
-		if (offsetReg == -1) // uses SIMM_16
-		{
-			MOVI2R(rB, offset);
-			ADD(rB, rB, RA);
-		}
-		else
-		{
-			ADD(rB, gpr.R(offsetReg), RA);
-		}
-	}
-	else
-	{
-		if (zeroA)
-		{
-			if (offsetReg == -1)
-			{
-				if (a)
-				{
-					RA = gpr.R(a);
-					MOVI2R(rB, offset);
-					ADD(rB, rB, RA);
-				}
-				else
-				{
-					MOVI2R(rB, (u32)offset);
-				}
-			}
-			else
-			{
-				ARMReg RB = gpr.R(offsetReg);
-				if (a)
-				{
-					RA = gpr.R(a);
-					ADD(rB, RB, RA);
-				}
-				else
-				{
-					MOV(rB, RB);
-				}
-			}
-		}
-	}
+		// Always uses RA
+		if (gpr.IsImm(a) && offsetReg == -1)
+		{
+			is_immediate = true;
+			imm_addr = offset + gpr.GetImm(a);
+		}
+		else if (gpr.IsImm(a) && offsetReg != -1 && gpr.IsImm(offsetReg))
+		{
+			is_immediate = true;
+			imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg);
+		}
+		else
+		{
+			if (offsetReg == -1)
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(addr, gpr.R(a), off);
+				}
+				else
+				{
+					MOVI2R(addr, offset);
+					ADD(addr, addr, gpr.R(a));
+				}
+			}
+			else
+			{
+				ADD(addr, gpr.R(offsetReg), gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (offsetReg == -1)
+		{
+			if (a && gpr.IsImm(a))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + offset;
+			}
+			else if (a)
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(addr, gpr.R(a), off);
+				}
+				else
+				{
+					MOVI2R(addr, offset);
+					ADD(addr, addr, gpr.R(a));
+				}
+			}
+			else
+			{
+				is_immediate = true;
+				imm_addr = offset;
+			}
+		}
+		else
+		{
+			if (a && gpr.IsImm(a) && gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg);
+			}
+			else if (!a && gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(offsetReg);
+			}
+			else if (a)
+			{
+				ADD(addr, gpr.R(a), gpr.R(offsetReg));
+			}
+			else
+			{
+				MOV(addr, gpr.R(offsetReg));
+			}
+		}
+	}
+
+	if (update)
+		RA = gpr.R(a);
+
+	if (is_immediate)
+		MOVI2R(addr, imm_addr);
 
 	LDR(rA, R9, PPCSTATE_OFF(Exceptions));
 	CMP(rA, EXCEPTION_DSI);
 	FixupBranch DoNotLoad = B_CC(CC_EQ);
 
 	if (update)
-		MOV(RA, rB);
+		MOV(RA, addr);
 
-	// This branch gets changed to a NOP when the fastpath fails
-	FixupBranch fast_path;
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem)
-		fast_path = B();
-	{
-		PUSH(4, R0, R1, R2, R3);
-		MOV(R0, rB);
-		if (single)
-		{
-			MOVI2R(rA, (u32)&Memory::Read_U32);
-			BL(rA);
-			VMOV(S0, R0);
-			VCVT(v0, S0, 0);
-			VCVT(v1, S0, 0);
-		}
-		else
-		{
-			MOVI2R(rA, (u32)&Memory::Read_F64);
-			BL(rA);
-#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1
-			VMOV(v0, R0);
-#else
-			VMOV(v0, D0);
-#endif
-		}
-		POP(4, R0, R1, R2, R3);
-	}
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem)
-	{
-		FixupBranch slow_out = B();
-		SetJumpTarget(fast_path);
-		{
-			Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-			ARMReg rC = gpr.GetReg();
-			BIC(rC, rB, mask);
-			MOVI2R(rA, (u32)Memory::base);
-			ADD(rC, rC, rA);
-
-			NEONXEmitter nemit(this);
-			if (single)
-			{
-				nemit.VLD1(F_32, D0, rC);
-				nemit.VREV32(I_8, D0, D0); // Byte swap to result
-				VCVT(v0, S0, 0);
-				VCVT(v1, S0, 0);
-			}
-			else
-			{
-				nemit.VLD1(I_64, v0, rC);
-				nemit.VREV64(I_8, v0, v0); // Byte swap to result
-			}
-			gpr.Unlock(rC);
-		}
-		SetJumpTarget(slow_out);
-	}
-
-	gpr.Unlock(rA, rB);
+	EmitBackpatchRoutine(this, flags,
+		SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
+		!(is_immediate && Memory::IsRAMAddress(imm_addr)), v0, v1);
+
 	SetJumpTarget(DoNotLoad);
 }
@@ -201,16 +193,13 @@ void JitArm::stfXX(UGeckoInstruction inst)
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStoreFloatingOff);
 
-	ARMReg rA = gpr.GetReg();
-	ARMReg rB = gpr.GetReg();
 	ARMReg RA;
 
 	u32 a = inst.RA, b = inst.RB;
 
 	s32 offset = inst.SIMM_16;
-	bool single = false;
+	u32 flags = BackPatchInfo::FLAG_STORE;
 	bool update = false;
-	bool zeroA = false;
 	s32 offsetReg = -1;
 
 	switch (inst.OPCD)
@@ -219,157 +208,196 @@ void JitArm::stfXX(UGeckoInstruction inst)
 		switch (inst.SUBOP10)
 		{
 			case 663: // stfsx
-				single = true;
-				zeroA = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F32;
 				offsetReg = b;
 			break;
 			case 695: // stfsux
-				single = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F32;
 				offsetReg = b;
 			break;
 			case 727: // stfdx
-				zeroA = true;
+				flags |= BackPatchInfo::FLAG_SIZE_F64;
 				offsetReg = b;
 			break;
 			case 759: // stfdux
+				flags |= BackPatchInfo::FLAG_SIZE_F64;
 				update = true;
 				offsetReg = b;
 			break;
 		}
 	break;
 	case 53: // stfsu
+		flags |= BackPatchInfo::FLAG_SIZE_F32;
 		update = true;
-		single = true;
 	break;
 	case 52: // stfs
-		single = true;
-		zeroA = true;
+		flags |= BackPatchInfo::FLAG_SIZE_F32;
 	break;
 	case 55: // stfdu
+		flags |= BackPatchInfo::FLAG_SIZE_F64;
 		update = true;
 	break;
 	case 54: // stfd
-		zeroA = true;
+		flags |= BackPatchInfo::FLAG_SIZE_F64;
 	break;
 	}
 
 	ARMReg v0 = fpr.R0(inst.FS);
+
+	ARMReg rA = R11;
+	ARMReg addr = R12;
+
+	u32 imm_addr = 0;
+	bool is_immediate = false;
 
 	if (update)
 	{
-		RA = gpr.R(a);
-		// Update path /always/ uses RA
-		if (offsetReg == -1) // uses SIMM_16
-		{
-			MOVI2R(rB, offset);
-			ADD(rB, rB, RA);
-		}
-		else
-		{
-			ADD(rB, gpr.R(offsetReg), RA);
-		}
-	}
-	else
-	{
-		if (zeroA)
-		{
-			if (offsetReg == -1)
-			{
-				if (a)
-				{
-					RA = gpr.R(a);
-					MOVI2R(rB, offset);
-					ADD(rB, rB, RA);
-				}
-				else
-				{
-					MOVI2R(rB, (u32)offset);
-				}
-			}
-			else
-			{
-				ARMReg RB = gpr.R(offsetReg);
-				if (a)
-				{
-					RA = gpr.R(a);
-					ADD(rB, RB, RA);
-				}
-				else
-				{
-					MOV(rB, RB);
-				}
-			}
-		}
-	}
+		// Always uses RA
+		if (gpr.IsImm(a) && offsetReg == -1)
+		{
+			is_immediate = true;
+			imm_addr = offset + gpr.GetImm(a);
+		}
+		else if (gpr.IsImm(a) && offsetReg != -1 && gpr.IsImm(offsetReg))
+		{
+			is_immediate = true;
+			imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg);
+		}
+		else
+		{
+			if (offsetReg == -1)
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(addr, gpr.R(a), off);
+				}
+				else
+				{
+					MOVI2R(addr, offset);
+					ADD(addr, addr, gpr.R(a));
+				}
+			}
+			else
+			{
+				ADD(addr, gpr.R(offsetReg), gpr.R(a));
+			}
+		}
+	}
+	else
+	{
+		if (offsetReg == -1)
+		{
+			if (a && gpr.IsImm(a))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + offset;
+			}
+			else if (a)
+			{
+				Operand2 off;
+				if (TryMakeOperand2(offset, off))
+				{
+					ADD(addr, gpr.R(a), off);
+				}
+				else
+				{
+					MOVI2R(addr, offset);
+					ADD(addr, addr, gpr.R(a));
+				}
+			}
+			else
+			{
+				is_immediate = true;
+				imm_addr = offset;
+			}
+		}
+		else
+		{
+			if (a && gpr.IsImm(a) && gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(a) + gpr.GetImm(offsetReg);
+			}
+			else if (!a && gpr.IsImm(offsetReg))
+			{
+				is_immediate = true;
+				imm_addr = gpr.GetImm(offsetReg);
+			}
+			else if (a)
+			{
+				ADD(addr, gpr.R(a), gpr.R(offsetReg));
+			}
+			else
+			{
+				MOV(addr, gpr.R(offsetReg));
+			}
+		}
+	}
+
+	if (is_immediate)
+		MOVI2R(addr, imm_addr);
 
 	if (update)
 	{
+		RA = gpr.R(a);
 		LDR(rA, R9, PPCSTATE_OFF(Exceptions));
 		CMP(rA, EXCEPTION_DSI);
 		SetCC(CC_NEQ);
-		MOV(RA, rB);
+		MOV(RA, addr);
 		SetCC();
 	}
 
-	// This branch gets changed to a NOP when the fastpath fails
-	FixupBranch fast_path;
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem)
-		fast_path = B();
-	{
-		PUSH(4, R0, R1, R2, R3);
-		if (single)
-		{
-			MOV(R1, rB);
-			VCVT(S0, v0, 0);
-			VMOV(R0, S0);
-			MOVI2R(rA, (u32)&Memory::Write_U32);
-			BL(rA);
-		}
-		else
-		{
-			MOVI2R(rA, (u32)&Memory::Write_F64);
-#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1
-			VMOV(R0, v0);
-			MOV(R2, rB);
-#else
-			VMOV(D0, v0);
-			MOV(R0, rB);
-#endif
-			BL(rA);
-		}
-		POP(4, R0, R1, R2, R3);
-	}
-	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem)
-	{
-		FixupBranch slow_out = B();
-		SetJumpTarget(fast_path);
-		{
-			Operand2 mask(2, 1); // ~(Memory::MEMVIEW32_MASK)
-			ARMReg rC = gpr.GetReg();
-			BIC(rC, rB, mask);
-			MOVI2R(rA, (u32)Memory::base);
-			ADD(rC, rC, rA);
-
-			NEONXEmitter nemit(this);
-			if (single)
-			{
-				VCVT(S0, v0, 0);
-				nemit.VREV32(I_8, D0, D0);
-				VSTR(S0, rC, 0);
-			}
-			else
-			{
-				nemit.VREV64(I_8, D0, v0);
-				VSTR(D0, rC, 0);
-			}
-			gpr.Unlock(rC);
-		}
-		SetJumpTarget(slow_out);
-	}
-	gpr.Unlock(rA, rB);
+	if (is_immediate)
+	{
+		if ((imm_addr & 0xFFFFF000) == 0xCC008000 && jit->jo.optimizeGatherPipe)
+		{
+			int accessSize;
+			if (flags & BackPatchInfo::FLAG_SIZE_F64)
+				accessSize = 64;
+			else
+				accessSize = 32;
+
+			MOVI2R(R14, (u32)&GPFifo::m_gatherPipeCount);
+			MOVI2R(R10, (u32)GPFifo::m_gatherPipe);
+			LDR(R11, R14);
+			ADD(R10, R10, R11);
+			NEONXEmitter nemit(this);
+			if (accessSize == 64)
+			{
+				PUSH(2, R0, R1);
+				nemit.VREV64(I_8, D0, v0);
+				VMOV(R0, D0);
+				STR(R0, R10, 0);
+				STR(R1, R10, 4);
+				POP(2, R0, R1);
+			}
+			else if (accessSize == 32)
+			{
+				VCVT(S0, v0, 0);
+				nemit.VREV32(I_8, D0, D0);
+				VMOV(addr, S0);
+				STR(addr, R10);
+			}
+			ADD(R11, R11, accessSize >> 3);
+			STR(R11, R14);
+			jit->js.fifoBytesThisBlock += accessSize >> 3;
+		}
+		else if (Memory::IsRAMAddress(imm_addr))
+		{
+			MOVI2R(addr, imm_addr);
+			EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, false, v0);
+		}
+		else
+		{
+			MOVI2R(addr, imm_addr);
+			EmitBackpatchRoutine(this, flags, false, false, v0);
+		}
+	}
+	else
+	{
+		EmitBackpatchRoutine(this, flags, SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem, true, v0);
+	}
 }
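One detail worth spelling out in the stfs path above: the PPC register is held as a double, so the emitted code converts it down with VCVT and byteswaps it before it reaches memory. A standalone model of the value that ends up in RAM for a single-precision store, assuming a big-endian guest and a little-endian host:

    #include <cstdint>
    #include <cstring>

    static uint32_t SingleStoreBits(double ppc_value)
    {
        float as_single = static_cast<float>(ppc_value); // the VCVT(S0, RS, 0)
        uint32_t bits;
        std::memcpy(&bits, &as_single, sizeof(bits));
        return __builtin_bswap32(bits); // the VREV32 before the store (GCC/Clang builtin)
    }

    int main()
    {
        // 1.0f is 0x3F800000; byteswapped for the big-endian guest: 0x0000803F.
        return SingleStoreBits(1.0) == 0x0000803F ? 0 : 1;
    }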