mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-02-24 21:40:10 +00:00
Merge pull request #995 from FioraAeterna/fma
Add FMA support to emitter and use it in the JIT
This commit is contained in:
commit
9ddbdeb39f
@ -1355,9 +1355,9 @@ void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extr
|
||||
arg.WriteRest(this, extrabytes);
|
||||
}
|
||||
|
||||
// Two-operand convenience overload: forwards to the three-operand WriteAVXOp
// with INVALID_REG in the second register slot. opPrefix, op, regOp, arg, W
// and extrabytes are passed through unchanged.
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W, int extrabytes)
|
||||
{
|
||||
WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
|
||||
WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, W, extrabytes);
|
||||
}
|
||||
|
||||
static int GetVEXmmmmm(u16 op)
|
||||
@ -1383,14 +1383,14 @@ static int GetVEXpp(u8 opPrefix)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Emits one VEX-encoded instruction with up to two register operands plus an
// OpArg operand.
//   opPrefix   - legacy prefix byte; GetVEXpp maps it to the value passed as "pp".
//   op         - opcode; GetVEXmmmmm derives the "mmmmm" value from it, and only
//                the low byte (op & 0xFF) is written as the opcode byte.
//   regOp1/2   - register operands handed to OpArg::WriteVex; regOp1 is also
//                handed to OpArg::WriteRest.
//   arg        - operand whose WriteVex/WriteRest calls emit the prefix and the
//                trailing operand bytes.
//   W          - forwarded as the last argument of OpArg::WriteVex (presumably
//                the VEX.W bit - confirm against WriteVex). The PD/SD FMA
//                wrappers in this file pass 1.
//   extrabytes - forwarded to OpArg::WriteRest.
// NOTE(review): only cpu_info.bAVX is checked here even though the FMA wrappers
// also route through this function; callers appear to be expected to check
// cpu_info.bFMA themselves - confirm.
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W, int extrabytes)
|
||||
{
|
||||
// There is no early return after the alert: if PanicAlert returns, the
// instruction bytes below are still written.
if (!cpu_info.bAVX)
|
||||
PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
|
||||
int mmmmm = GetVEXmmmmm(op);
|
||||
int pp = GetVEXpp(opPrefix);
|
||||
// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
|
||||
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm);
|
||||
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, W);
|
||||
// Only the low opcode byte is emitted here; the rest of `op` was consumed by
// GetVEXmmmmm above.
Write8(op & 0xFF);
|
||||
arg.WriteRest(this, extrabytes, regOp1);
|
||||
}
|
||||
@ -1799,10 +1799,71 @@ void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x6
|
||||
// Three-operand AVX wrappers. Each one forwards its operands to WriteAVXOp
// with prefix 0x66 and the opcode shown in the call.
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
|
||||
// In the call below the two trailing arguments are W = 0 and extrabytes = 1;
// the shuffle immediate itself is written by the Write8 that follows
// (presumably extrabytes = 1 accounts for that byte - confirm in WriteRest).
void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); Write8(shuffle);}
|
||||
void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
|
||||
void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
|
||||
|
||||
// FMA: VFMADD{132,213,231}{PS,PD,SS,SD}.
// Every form forwards to WriteAVXOp with prefix 0x66. The low opcode byte is
// 0x98/0xA8/0xB8 for the packed 132/213/231 variants and 0x99/0xA9/0xB9 for
// the scalar ones. The PD/SD forms pass W = 1; the PS/SS forms omit the
// argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1);}
|
||||
// FMA: VFMSUB{132,213,231}{PS,PD,SS,SD}.
// Every form forwards to WriteAVXOp with prefix 0x66. The low opcode byte is
// 0x9A/0xAA/0xBA for the packed 132/213/231 variants and 0x9B/0xAB/0xBB for
// the scalar ones. The PD/SD forms pass W = 1; the PS/SS forms omit the
// argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1);}
|
||||
// FMA: VFNMADD{132,213,231}{PS,PD,SS,SD}.
// Every form forwards to WriteAVXOp with prefix 0x66. The low opcode byte is
// 0x9C/0xAC/0xBC for the packed 132/213/231 variants and 0x9D/0xAD/0xBD for
// the scalar ones. The PD/SD forms pass W = 1; the PS/SS forms omit the
// argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1);}
|
||||
// FMA: VFNMSUB{132,213,231}{PS,PD,SS,SD}.
// Every form forwards to WriteAVXOp with prefix 0x66. The low opcode byte is
// 0x9E/0xAE/0xBE for the packed 132/213/231 variants and 0x9F/0xAF/0xBF for
// the scalar ones. The PD/SD forms pass W = 1; the PS/SS forms omit the
// argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1);}
|
||||
// FMA: VFMADDSUB{132,213,231}{PS,PD}; only packed forms are defined here.
// Every form forwards to WriteAVXOp with prefix 0x66 and low opcode byte
// 0x96/0xA6/0xB6 for the 132/213/231 variants. The PD forms pass W = 1; the
// PS forms omit the argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1);}
|
||||
// FMA: VFMSUBADD{132,213,231}{PS,PD}; only packed forms are defined here.
// Every form forwards to WriteAVXOp with prefix 0x66 and low opcode byte
// 0x97/0xA7/0xB7 for the 132/213/231 variants. The PD forms pass W = 1; the
// PS forms omit the argument and so use WriteAVXOp's default of W = 0.
void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg);}
|
||||
void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1);}
|
||||
void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1);}
|
||||
|
||||
void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
|
||||
void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
|
||||
void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
|
||||
|
@ -294,8 +294,8 @@ private:
|
||||
void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W = 0, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W = 0, int extrabytes = 0);
|
||||
void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
@ -773,6 +773,68 @@ public:
|
||||
void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
|
||||
// FMA
|
||||
void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
|
||||
// VEX GPR instructions
|
||||
void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
|
||||
void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include "Core/HW/Wiimote.h"
|
||||
#include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h"
|
||||
#include "Core/IPC_HLE/WII_Socket.h"
|
||||
#include "Core/PowerPC/JitInterface.h"
|
||||
#include "Core/PowerPC/PowerPC.h"
|
||||
|
||||
#ifdef USE_GDBSTUB
|
||||
@ -728,6 +729,8 @@ void UpdateWantDeterminism(bool initial)
|
||||
g_want_determinism = new_want_determinism;
|
||||
WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism);
|
||||
g_video_backend->UpdateWantDeterminism(new_want_determinism);
|
||||
// We need to clear the cache because some parts of the JIT depend on want_determinism, e.g. use of FMA.
|
||||
JitInterface::ClearCache();
|
||||
|
||||
Core::PauseAndLock(false, was_unpaused);
|
||||
}
|
||||
|
@ -90,9 +90,44 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
|
||||
fpr.Lock(a, b, c, d);
|
||||
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (inst.SUBOP5 == 30) //nmsub
|
||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
||||
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
||||
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
||||
// instances on different computers giving identical results.
|
||||
if (cpu_info.bFMA && !Core::g_want_determinism)
|
||||
{
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
// Statistics suggest b is a lot less likely to be unbound in practice, so
|
||||
// if we have to pick one of a or b to bind, let's make it b.
|
||||
fpr.BindToRegister(b, true, false);
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 28: //msub
|
||||
VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 29: //madd
|
||||
VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
// PowerPC and x86 define NMADD/NMSUB differently
|
||||
// x86: D = -A*C (+/-) B
|
||||
// PPC: D = -(A*C (+/-) B)
|
||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||
case 30: //nmsub
|
||||
VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (inst.SUBOP5 == 30) //nmsub
|
||||
{
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||
else
|
||||
@ -115,6 +150,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
if (inst.SUBOP5 == 31) //nmadd
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
}
|
||||
|
||||
fpr.BindToRegister(d, false);
|
||||
//YES it is necessary to dupe the result :(
|
||||
//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
|
||||
|
@ -305,50 +305,77 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||
int b = inst.FB;
|
||||
int c = inst.FC;
|
||||
int d = inst.FD;
|
||||
bool fma = cpu_info.bFMA && !Core::g_want_determinism;
|
||||
fpr.Lock(a,b,c,d);
|
||||
|
||||
switch (inst.SUBOP5)
|
||||
if (fma)
|
||||
fpr.BindToRegister(b, true, false);
|
||||
|
||||
if (inst.SUBOP5 == 14)
|
||||
{
|
||||
case 14: //madds0
|
||||
MOVDDUP(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 15: //madds1
|
||||
}
|
||||
else if (inst.SUBOP5 == 15)
|
||||
{
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 28: //msub
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 29: //madd
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
|
||||
//FallBackToInterpreter(inst);
|
||||
//fpr.UnlockAll();
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
}
|
||||
|
||||
if (fma)
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14: //madds0
|
||||
case 15: //madds1
|
||||
case 29: //madd
|
||||
VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 28: //msub
|
||||
VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a));
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14: //madds0
|
||||
case 15: //madds1
|
||||
case 29: //madd
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 28: //msub
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
fpr.BindToRegister(d, false);
|
||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
|
@ -948,4 +948,47 @@ VEX_RM_TEST(BLSI)
|
||||
|
||||
VEX_RMI_TEST(RORX)
|
||||
|
||||
// Test generator for AVX instructions that take the form op reg, reg, r/m.
// Name is the emitter method under test; sizename is the memory-operand size
// keyword expected in the disassembly (e.g. "dqword").
// For every register in xmmnames the test emits three encodings, varying one
// operand position at a time:
//   Name(r, RAX, R(RAX))      - r as the first register operand
//   Name(RAX, RAX, R(r))      - r as the register-direct r/m operand
//   Name(RAX, r, MatR(R12))   - r as the second register operand, memory r/m
// and checks the disassembly, in which the RAX placeholders are expected to
// print as out_name ("xmm0"). The `bits` field of the regset is initialised to
// 64 but is not read anywhere in this macro.
|
||||
#define AVX_RRM_TEST(Name, sizename) \
|
||||
TEST_F(x64EmitterTest, Name) \
|
||||
{ \
|
||||
struct { \
|
||||
int bits; \
|
||||
std::vector<NamedReg> regs; \
|
||||
std::string out_name; \
|
||||
std::string size; \
|
||||
} regsets[] = { \
|
||||
{ 64, xmmnames, "xmm0", sizename }, \
|
||||
}; \
|
||||
for (const auto& regset : regsets) \
|
||||
for (const auto& r : regset.regs) \
|
||||
{ \
|
||||
emitter->Name(r.reg, RAX, R(RAX)); \
|
||||
emitter->Name(RAX, RAX, R(r.reg)); \
|
||||
emitter->Name(RAX, r.reg, MatR(R12)); \
|
||||
ExpectDisassembly(#Name " " + r.name+ ", " + regset.out_name + ", " + regset.out_name + " " \
|
||||
#Name " " + regset.out_name + ", " + regset.out_name + ", " + r.name + " " \
|
||||
#Name " " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12] "); \
|
||||
} \
|
||||
}
|
||||
|
||||
// Instantiates AVX_RRM_TEST for the six variants of one FMA mnemonic: the
// 132/213/231 forms, each with an S and a D element-size suffix.
//   Name   - mnemonic stem, e.g. VFMADD
//   P      - letter pasted between the form digits and the element-size suffix
//            (P or S in the instantiations below)
//   packed - selects the expected memory-operand size keyword: "dqword" when
//            true, otherwise "dword" for the S variants and "qword" for the D
//            variants.
#define FMA_TEST(Name, P, packed) \
|
||||
AVX_RRM_TEST(Name ## 132 ## P ## S, packed ? "dqword" : "dword") \
|
||||
AVX_RRM_TEST(Name ## 213 ## P ## S, packed ? "dqword" : "dword") \
|
||||
AVX_RRM_TEST(Name ## 231 ## P ## S, packed ? "dqword" : "dword") \
|
||||
AVX_RRM_TEST(Name ## 132 ## P ## D, packed ? "dqword" : "qword") \
|
||||
AVX_RRM_TEST(Name ## 213 ## P ## D, packed ? "dqword" : "qword") \
|
||||
AVX_RRM_TEST(Name ## 231 ## P ## D, packed ? "dqword" : "qword")
|
||||
|
||||
// Packed and scalar tests for the four mnemonics that have both; VFMADDSUB and
// VFMSUBADD are instantiated for the packed forms only.
FMA_TEST(VFMADD, P, true)
|
||||
FMA_TEST(VFMADD, S, false)
|
||||
FMA_TEST(VFMSUB, P, true)
|
||||
FMA_TEST(VFMSUB, S, false)
|
||||
FMA_TEST(VFNMADD, P, true)
|
||||
FMA_TEST(VFNMADD, S, false)
|
||||
FMA_TEST(VFNMSUB, P, true)
|
||||
FMA_TEST(VFNMSUB, S, false)
|
||||
FMA_TEST(VFMADDSUB, P, true)
|
||||
FMA_TEST(VFMSUBADD, P, true)
|
||||
|
||||
} // namespace Gen
|
||||
|
Loading…
x
Reference in New Issue
Block a user