And a bit more WIP JIT work, mostly floating-point improvements. It

should be getting to the point where performance is reasonably
comparable to the current JIT on most workloads. I don't have exact
comparisons because I haven't done much benchmarking against the
current JIT on floating-point-heavy code. The floating-point support is
still relatively unoptimized, and there are still a few relatively
common functions that aren't being compiled yet, but the most critical
work is done.



git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1807 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
magumagu9 2009-01-06 07:35:06 +00:00
parent acae6e6b69
commit ce2b4bead9
5 changed files with 262 additions and 188 deletions

View File

@ -305,6 +305,10 @@ InstLoc IRBuilder::FoldUOp(unsigned Opcode, InstLoc Op1, unsigned extra) {
if (getOpcode(*Op1) == ExpandPackedToMReg) if (getOpcode(*Op1) == ExpandPackedToMReg)
return getOp1(Op1); return getOp1(Op1);
} }
if (Opcode == DoubleToSingle) {
if (getOpcode(*Op1) == DupSingleToMReg)
return getOp1(Op1);
}
return EmitUOp(Opcode, Op1, extra); return EmitUOp(Opcode, Op1, extra);
} }
@ -590,6 +594,11 @@ static OpArg fregLocForSlot(RegInfo& RI, unsigned slot) {
return M(&FSlotSet[slot*16]); return M(&FSlotSet[slot*16]);
} }
// Used for accessing the top half of a spilled double.
// Each FP spill slot is 16 bytes (see fregLocForSlot); offset +4 addresses
// the second 32-bit word of the slot, so the two halves of a 64-bit value
// can be moved with plain 32-bit memory operations.
static OpArg fregLocForSlotPlusFour(RegInfo& RI, unsigned slot) {
return M(&FSlotSet[slot*16+4]);
}
static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) { static unsigned fregCreateSpill(RegInfo& RI, InstLoc I) {
unsigned newSpill = ++RI.numFSpills; unsigned newSpill = ++RI.numFSpills;
RI.IInfo[I - RI.FirstI] |= newSpill << 16; RI.IInfo[I - RI.FirstI] |= newSpill << 16;
@ -679,6 +688,16 @@ static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) {
return loc.GetSimpleReg(); return loc.GetSimpleReg();
} }
// Ensure the FP value produced by instruction I is resident in an XMM
// register, and return that register. If the value currently lives in
// memory (a spill slot), it is copied into a free XMM register first.
// NOTE(review): the freshly loaded register is not recorded in RI.fregs
// here, so it is presumably only valid until the next allocation --
// callers should consume the result immediately; confirm.
static X64Reg fregEnsureInReg(RegInfo& RI, InstLoc I) {
OpArg loc = fregLocForInst(RI, I);
if (!loc.IsSimpleReg()) {
// Value is in memory; materialize it in a scratch XMM register.
X64Reg newReg = fregFindFreeReg(RI);
RI.Jit->MOVAPD(newReg, loc);
loc = R(newReg);
}
return loc.GetSimpleReg();
}
static void regSpillCallerSaved(RegInfo& RI) { static void regSpillCallerSaved(RegInfo& RI) {
regSpill(RI, EDX); regSpill(RI, EDX);
regSpill(RI, ECX); regSpill(RI, ECX);
@ -1044,6 +1063,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ExpandPackedToMReg: case ExpandPackedToMReg:
case CompactMRegToPacked: case CompactMRegToPacked:
case FPNeg: case FPNeg:
case FSNeg:
case FDNeg:
if (thisUsed) if (thisUsed)
regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp1(I), 1);
break; break;
@ -1052,6 +1073,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case Load32: case Load32:
regMarkMemAddress(RI, I, getOp1(I), 1); regMarkMemAddress(RI, I, getOp1(I), 1);
break; break;
case LoadDouble:
case LoadSingle: case LoadSingle:
case LoadPaired: case LoadPaired:
regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp1(I), 1);
@ -1086,9 +1108,17 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
case ICmpSgt: case ICmpSgt:
case FSMul: case FSMul:
case FSAdd: case FSAdd:
case FSSub:
case FDMul:
case FDAdd:
case FDSub:
case FPAdd: case FPAdd:
case FPMul: case FPMul:
case FPSub: case FPSub:
case FPMerge00:
case FPMerge01:
case FPMerge10:
case FPMerge11:
case InsertDoubleInMReg: case InsertDoubleInMReg:
if (thisUsed) { if (thisUsed) {
regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp1(I), 1);
@ -1104,6 +1134,7 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regMarkMemAddress(RI, I, getOp2(I), 2); regMarkMemAddress(RI, I, getOp2(I), 2);
break; break;
case StoreSingle: case StoreSingle:
case StoreDouble:
case StorePaired: case StorePaired:
regMarkUse(RI, I, getOp1(I), 1); regMarkUse(RI, I, getOp1(I), 1);
regMarkUse(RI, I, getOp2(I), 2); regMarkUse(RI, I, getOp2(I), 2);
@ -1417,6 +1448,21 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regNormalRegClear(RI, I); regNormalRegClear(RI, I);
break; break;
} }
case LoadDouble: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
Jit->ADD(32, R(ECX), Imm8(4));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(reg, R(ECX));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I)));
RI.Jit->UnsafeLoadRegToReg(ECX, ECX, 32, 0, false);
Jit->MOVD_xmm(XMM0, R(ECX));
Jit->PUNPCKLDQ(reg, R(XMM0));
RI.fregs[reg] = I;
regNormalRegClear(RI, I);
break;
}
case LoadPaired: { case LoadPaired: {
if (!thisUsed) break; if (!thisUsed) break;
regSpill(RI, EAX); regSpill(RI, EAX);
@ -1449,6 +1495,30 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
regClearInst(RI, getOp2(I)); regClearInst(RI, getOp2(I));
break; break;
} }
case StoreDouble: {
regSpill(RI, EAX);
// FIXME: Use 64-bit where possible
// FIXME: Use unsafe write with pshufb where possible
unsigned fspill = fregGetSpill(RI, getOp1(I));
if (!fspill) {
// Force the value to spill, so we can use
// memory operations to load it
fspill = fregCreateSpill(RI, getOp1(I));
X64Reg reg = fregLocForInst(RI, getOp1(I)).GetSimpleReg();
RI.Jit->MOVAPD(fregLocForSlot(RI, fspill), reg);
}
Jit->MOV(32, R(EAX), fregLocForSlotPlusFour(RI, fspill));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 0);
Jit->MOV(32, R(EAX), fregLocForSlot(RI, fspill));
Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I)));
RI.Jit->SafeWriteRegToReg(EAX, ECX, 32, 4);
if (RI.IInfo[I - RI.FirstI] & 4)
fregClearInst(RI, getOp1(I));
if (RI.IInfo[I - RI.FirstI] & 8)
regClearInst(RI, getOp2(I));
break;
}
case StorePaired: { case StorePaired: {
regSpill(RI, EAX); regSpill(RI, EAX);
regSpill(RI, EDX); regSpill(RI, EDX);
@ -1501,6 +1571,28 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I); fregNormalRegClear(RI, I);
break; break;
} }
case FSNeg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
static const u32 GC_ALIGNED16(ssSignBits[4]) =
{0x80000000};
Jit->PXOR(reg, M((void*)&ssSignBits));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FDNeg: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
static const u64 GC_ALIGNED16(ssSignBits[2]) =
{0x8000000000000000ULL};
Jit->PXOR(reg, M((void*)&ssSignBits));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FPNeg: { case FPNeg: {
if (!thisUsed) break; if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI); X64Reg reg = fregFindFreeReg(RI);
@ -1522,8 +1614,8 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
} }
case StoreFReg: { case StoreFReg: {
unsigned ppcreg = *I >> 16; unsigned ppcreg = *I >> 16;
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I))); Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]),
Jit->MOVAPD(M(&PowerPC::ppcState.ps[ppcreg]), XMM0); fregEnsureInReg(RI, getOp1(I)));
fregNormalRegClear(RI, I); fregNormalRegClear(RI, I);
break; break;
} }
@ -1553,6 +1645,42 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I); fregNormalRegClear(RI, I);
break; break;
} }
case FSSub: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->SUBSS(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FDMul: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MULSD(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FDAdd: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->ADDSD(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FDSub: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->SUBSD(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FPAdd: { case FPAdd: {
if (!thisUsed) break; if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI); X64Reg reg = fregFindFreeReg(RI);
@ -1580,6 +1708,47 @@ static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) {
fregNormalRegClear(RI, I); fregNormalRegClear(RI, I);
break; break;
} }
case FPMerge00: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I)));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FPMerge01: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
// Note reversed operands!
Jit->MOVAPD(reg, fregLocForInst(RI, getOp2(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
Jit->MOVSS(reg, R(XMM0));
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FPMerge10: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp2(I)));
Jit->MOVSS(reg, R(XMM0));
Jit->SHUFPS(reg, R(reg), 0xF1);
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case FPMerge11: {
if (!thisUsed) break;
X64Reg reg = fregFindFreeReg(RI);
Jit->MOVAPD(reg, fregLocForInst(RI, getOp1(I)));
Jit->PUNPCKLDQ(reg, fregLocForInst(RI, getOp2(I)));
Jit->SHUFPD(reg, R(reg), 0x1);
RI.fregs[reg] = I;
fregNormalRegClear(RI, I);
break;
}
case CInt32: case CInt32:
case CInt16: { case CInt16: {
if (!thisUsed) break; if (!thisUsed) break;

View File

@ -157,13 +157,24 @@ namespace IREmitter {
LoadFReg, LoadFReg,
FSMul, FSMul,
FSAdd, FSAdd,
FSSub,
FSNeg,
FPAdd, FPAdd,
FPMul, FPMul,
FPSub, FPSub,
FPNeg, FPNeg,
FDMul,
FDAdd,
FDSub,
FDNeg,
FPMerge00,
FPMerge01,
FPMerge10,
FPMerge11,
FResult_End, FResult_End,
StorePaired, StorePaired,
StoreSingle, StoreSingle,
StoreDouble,
StoreFReg, StoreFReg,
// "Trinary" operators // "Trinary" operators
@ -380,6 +391,9 @@ namespace IREmitter {
InstLoc EmitStoreSingle(InstLoc value, InstLoc addr) { InstLoc EmitStoreSingle(InstLoc value, InstLoc addr) {
return FoldBiOp(StoreSingle, value, addr); return FoldBiOp(StoreSingle, value, addr);
} }
// Emit an IR op storing a 64-bit FP 'value' to the guest address 'addr'.
InstLoc EmitStoreDouble(InstLoc value, InstLoc addr) {
return FoldBiOp(StoreDouble, value, addr);
}
InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) { InstLoc EmitStorePaired(InstLoc value, InstLoc addr, unsigned quantReg) {
return FoldBiOp(StorePaired, value, addr, quantReg); return FoldBiOp(StorePaired, value, addr, quantReg);
} }
@ -410,6 +424,24 @@ namespace IREmitter {
InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) { InstLoc EmitFSAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSAdd, op1, op2); return FoldBiOp(FSAdd, op1, op2);
} }
// Emit a single-precision FP subtract: op1 - op2.
InstLoc EmitFSSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(FSSub, op1, op2);
}
// Emit a single-precision FP negate of op1.
InstLoc EmitFSNeg(InstLoc op1) {
return FoldUOp(FSNeg, op1);
}
// Emit a double-precision FP multiply: op1 * op2.
InstLoc EmitFDMul(InstLoc op1, InstLoc op2) {
return FoldBiOp(FDMul, op1, op2);
}
// Emit a double-precision FP add: op1 + op2.
InstLoc EmitFDAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FDAdd, op1, op2);
}
// Emit a double-precision FP subtract: op1 - op2.
InstLoc EmitFDSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(FDSub, op1, op2);
}
// Emit a double-precision FP negate of op1.
InstLoc EmitFDNeg(InstLoc op1) {
return FoldUOp(FDNeg, op1);
}
InstLoc EmitFPAdd(InstLoc op1, InstLoc op2) { InstLoc EmitFPAdd(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPAdd, op1, op2); return FoldBiOp(FPAdd, op1, op2);
} }
@ -419,6 +451,18 @@ namespace IREmitter {
InstLoc EmitFPSub(InstLoc op1, InstLoc op2) { InstLoc EmitFPSub(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPSub, op1, op2); return FoldBiOp(FPSub, op1, op2);
} }
// Emit a paired-single merge taking element 0 of op1 and element 0 of op2
// (PowerPC ps_merge00).
InstLoc EmitFPMerge00(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPMerge00, op1, op2);
}
// Emit a paired-single merge taking element 0 of op1 and element 1 of op2
// (PowerPC ps_merge01).
InstLoc EmitFPMerge01(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPMerge01, op1, op2);
}
// Emit a paired-single merge taking element 1 of op1 and element 0 of op2
// (PowerPC ps_merge10).
InstLoc EmitFPMerge10(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPMerge10, op1, op2);
}
// Emit a paired-single merge taking element 1 of op1 and element 1 of op2
// (PowerPC ps_merge11).
InstLoc EmitFPMerge11(InstLoc op1, InstLoc op2) {
return FoldBiOp(FPMerge11, op1, op2);
}
InstLoc EmitFPNeg(InstLoc op1) { InstLoc EmitFPNeg(InstLoc op1) {
return FoldUOp(FPNeg, op1); return FoldUOp(FPNeg, op1);
} }

View File

@ -28,31 +28,40 @@
void Jit64::fp_arith_s(UGeckoInstruction inst) void Jit64::fp_arith_s(UGeckoInstruction inst)
{ {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 25) { if (inst.Rc || inst.OPCD != 59 || (inst.SUBOP5 != 25 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21)) {
Default(inst); return; Default(inst); return;
} }
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val);
bool dupe = inst.OPCD == 59;
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 25: //mul case 25: //mul
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); val = ibuild.EmitFDMul(val, ibuild.EmitLoadFReg(inst.FC));
break;
case 18: //div case 18: //div
case 20: //sub case 20: //sub
val = ibuild.EmitFDSub(val, ibuild.EmitLoadFReg(inst.FB));
break;
case 21: //add case 21: //add
val = ibuild.EmitFDAdd(val, ibuild.EmitLoadFReg(inst.FB));
break;
case 23: //sel case 23: //sel
case 24: //res case 24: //res
default: default:
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
} }
val = ibuild.EmitDupSingleToMReg(val);
if (inst.OPCD == 59) {
val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitDupSingleToMReg(val);
} else {
val = ibuild.EmitInsertDoubleInMReg(val, ibuild.EmitLoadFReg(inst.FD));
}
ibuild.EmitStoreFReg(val, inst.FD); ibuild.EmitStoreFReg(val, inst.FD);
} }
void Jit64::fmaddXX(UGeckoInstruction inst) void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
if (inst.Rc || inst.OPCD != 59 || inst.SUBOP5 != 29) { if (inst.Rc || inst.OPCD != 59) {
Default(inst); return; Default(inst); return;
} }
@ -61,7 +70,12 @@
IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA); IREmitter::InstLoc val = ibuild.EmitLoadFReg(inst.FA);
val = ibuild.EmitDoubleToSingle(val); val = ibuild.EmitDoubleToSingle(val);
val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC))); val = ibuild.EmitFSMul(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FC)));
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB))); if (inst.SUBOP5 & 1)
val = ibuild.EmitFSAdd(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
else
val = ibuild.EmitFSSub(val, ibuild.EmitDoubleToSingle(ibuild.EmitLoadFReg(inst.FB)));
if (inst.SUBOP5 & 2)
val = ibuild.EmitFSNeg(val);
val = ibuild.EmitDupSingleToMReg(val); val = ibuild.EmitDupSingleToMReg(val);
ibuild.EmitStoreFReg(val, inst.FD); ibuild.EmitStoreFReg(val, inst.FD);
} }
@ -78,57 +92,6 @@
void Jit64::fcmpx(UGeckoInstruction inst) void Jit64::fcmpx(UGeckoInstruction inst)
{ {
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) Default(inst);
{Default(inst); return;} // turn off from debugger return;
INSTRUCTION_START;
if (jo.fpAccurateFlags)
{
Default(inst);
return;
}
bool ordered = inst.SUBOP10 == 32;
/*
double fa = rPS0(_inst.FA);
double fb = rPS0(_inst.FB);
u32 compareResult;
if(IsNAN(fa) || IsNAN(fb)) compareResult = 1;
else if(fa < fb) compareResult = 8;
else if(fa > fb) compareResult = 4;
else compareResult = 2;
FPSCR.FPRF = compareResult;
CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4));
*/
int a = inst.FA;
int b = inst.FB;
int crf = inst.CRFD;
int shift = crf * 4;
//FPSCR
//XOR(32,R(EAX),R(EAX));
fpr.Lock(a,b);
if (a != b)
fpr.LoadToX64(a, true);
// USES_CR
if (ordered)
COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
else
UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b));
FixupBranch pLesser = J_CC(CC_B);
FixupBranch pGreater = J_CC(CC_A);
// _x86Reg == 0
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
FixupBranch continue1 = J();
// _x86Reg > 0
SetJumpTarget(pGreater);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
FixupBranch continue2 = J();
// _x86Reg < 0
SetJumpTarget(pLesser);
MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
SetJumpTarget(continue1);
SetJumpTarget(continue2);
fpr.UnlockAll();
} }

View File

@ -65,121 +65,26 @@ void Jit64::lfs(UGeckoInstruction inst)
void Jit64::lfd(UGeckoInstruction inst) void Jit64::lfd(UGeckoInstruction inst)
{ {
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), val;
{Default(inst); return;} // turn off from debugger if (inst.RA)
INSTRUCTION_START; addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
val = ibuild.EmitLoadFReg(inst.RD);
int d = inst.RD; val = ibuild.EmitInsertDoubleInMReg(ibuild.EmitLoadDouble(addr), val);
int a = inst.RA; ibuild.EmitStoreFReg(val, inst.RD);
if (!a) return;
{
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
MOV(32, R(ABI_PARAM1), gpr.R(a));
// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
fpr.LoadToX64(d, true);
fpr.Lock(d);
X64Reg xd = fpr.RX(d);
if (cpu_info.bSSSE3) {
#ifdef _M_X64
MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
MOVSD(xd, R(XMM0));
} else {
#ifdef _M_X64
MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
BSWAP(64, EAX);
MOV(64, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#else
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
BSWAP(32, EAX);
MOV(32, M((void*)((u32)&temp64+4)), R(EAX));
MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
BSWAP(32, EAX);
MOV(32, M(&temp64), R(EAX));
MOVSD(XMM0, M(&temp64));
MOVSD(xd, R(XMM0));
#if 0
// Alternate implementation; possibly faster
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSRLW(XMM0, 8);
MOVSD(xd, R(XMM0));
MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
PSHUFLW(XMM0, R(XMM0), 0x1B);
PSLLW(XMM0, 8);
POR(xd, R(XMM0));
#endif
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
} }
void Jit64::stfd(UGeckoInstruction inst) void Jit64::stfd(UGeckoInstruction inst)
{ {
if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16),
{Default(inst); return;} // turn off from debugger val = ibuild.EmitLoadFReg(inst.RS);
INSTRUCTION_START; if (inst.RA)
addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA));
int s = inst.RS; if (inst.OPCD & 1)
int a = inst.RA; ibuild.EmitStoreGReg(addr, inst.RA);
if (!a) ibuild.EmitStoreDouble(val, addr);
{ return;
Default(inst);
return;
}
s32 offset = (s32)(s16)inst.SIMM_16;
gpr.FlushLockX(ABI_PARAM1);
gpr.Lock(a);
fpr.Lock(s);
MOV(32, R(ABI_PARAM1), gpr.R(a));
#ifdef _M_IX86
AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
#endif
if (cpu_info.bSSSE3) {
MOVAPD(XMM0, fpr.R(s));
PSHUFB(XMM0, M((void *)bswapShuffle1x8));
#ifdef _M_X64
MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0);
#else
MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0);
#endif
} else {
#ifdef _M_X64
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(64, R(EAX), M(&temp64));
BSWAP(64, EAX);
MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX));
#else
fpr.LoadToX64(s, true, false);
MOVSD(M(&temp64), fpr.RX(s));
MOV(32, R(EAX), M(&temp64));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX));
MOV(32, R(EAX), M((void*)((u32)&temp64 + 4)));
BSWAP(32, EAX);
MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX));
#endif
}
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
} }

View File

@ -270,42 +270,35 @@
//TODO: find easy cases and optimize them, do a breakout like ps_arith //TODO: find easy cases and optimize them, do a breakout like ps_arith
void Jit64::ps_mergeXX(UGeckoInstruction inst) void Jit64::ps_mergeXX(UGeckoInstruction inst)
{ {
if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff)
{Default(inst); return;} // turn off from debugger
INSTRUCTION_START;
if (inst.Rc) { if (inst.Rc) {
Default(inst); return; Default(inst); return;
} }
int d = inst.FD;
int a = inst.FA;
int b = inst.FB;
fpr.Lock(a,b,d);
MOVAPD(XMM0, fpr.R(a)); IREmitter::InstLoc val = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FA)),
rhs = ibuild.EmitCompactMRegToPacked(ibuild.EmitLoadFReg(inst.FB));
switch (inst.SUBOP10) switch (inst.SUBOP10)
{ {
case 528: case 528:
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf val = ibuild.EmitFPMerge00(val, rhs);
break; //00 break; //00
case 560: case 560:
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here val = ibuild.EmitFPMerge01(val, rhs);
break; //01 break; //01
case 592: case 592:
SHUFPD(XMM0, fpr.R(b), 1); val = ibuild.EmitFPMerge10(val, rhs);
break; //10 break; //10
case 624: case 624:
UNPCKHPD(XMM0, fpr.R(b)); val = ibuild.EmitFPMerge11(val, rhs);
break; //11 break; //11
default: default:
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
} }
fpr.LoadToX64(d, false); val = ibuild.EmitExpandPackedToMReg(val);
MOVAPD(fpr.RX(d), Gen::R(XMM0)); ibuild.EmitStoreFReg(val, inst.FD);
fpr.UnlockAll();
} }
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst) void Jit64::ps_maddXX(UGeckoInstruction inst)
{ {
if (inst.Rc || (inst.SUBOP5 != 28 && inst.SUBOP5 != 29 && inst.SUBOP5 != 30)) { if (inst.Rc || (inst.SUBOP5 != 28 && inst.SUBOP5 != 29 && inst.SUBOP5 != 30)) {