From 94c20db36968bfcef67d43203ad19c4876f439d3 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:03:07 -0700 Subject: [PATCH 01/11] Rename Log2 and add IsPow2 to MathUtils for future use Also remove unused pow2/pow2f functions. --- Source/Core/Common/MathUtil.h | 7 +++---- Source/Core/DolphinWX/GameListCtrl.cpp | 2 +- Source/Core/VideoBackends/OGL/StreamBuffer.cpp | 2 +- .../Core/VideoCommon/TextureConversionShader.cpp | 6 +++--- Source/UnitTests/Common/MathUtilTest.cpp | 16 ++++++++-------- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index 9c0fe2c884..143398d6f7 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -175,16 +175,15 @@ struct Rectangle } // namespace MathUtil -inline float pow2f(float x) {return x * x;} -inline double pow2(double x) {return x * x;} - float MathFloatVectorSum(const std::vector&); #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) +inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;} + // Rounds down. 0 -> undefined -inline int Log2(u64 val) +inline int IntLog2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index a7b64e2567..98bc658a12 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size) // Find largest power of 2 less than _size. // div 10 to get largest named unit less than _size // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) - const u64 unit = Log2(std::max(_size, 1)) / 10; + const u64 unit = IntLog2(std::max(_size, 1)) / 10; const u64 unit_size = (1 << (unit * 10)); // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 5406b6e14c..cc2a3d18a3 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, u32 size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 76e49ce464..b0215e4c08 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); - WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1)); if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments @@ 
-100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); diff --git a/Source/UnitTests/Common/MathUtilTest.cpp b/Source/UnitTests/Common/MathUtilTest.cpp index 8ae757962c..9549039304 100644 --- a/Source/UnitTests/Common/MathUtilTest.cpp +++ b/Source/UnitTests/Common/MathUtilTest.cpp @@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN) EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits::signaling_NaN())); } -TEST(MathUtil, Log2) +TEST(MathUtil, IntLog2) { - EXPECT_EQ(0, Log2(1)); - EXPECT_EQ(1, Log2(2)); - EXPECT_EQ(2, Log2(4)); - EXPECT_EQ(3, Log2(8)); - EXPECT_EQ(63, Log2(0x8000000000000000ull)); + EXPECT_EQ(0, IntLog2(1)); + EXPECT_EQ(1, IntLog2(2)); + EXPECT_EQ(2, IntLog2(4)); + EXPECT_EQ(3, IntLog2(8)); + EXPECT_EQ(63, IntLog2(0x8000000000000000ull)); // Rounding behavior. - EXPECT_EQ(3, Log2(15)); - EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); + EXPECT_EQ(3, IntLog2(15)); + EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull)); } TEST(MathUtil, FlushToZero) From 858296e1c7fe6a492a1e31d532bffea5361b0c39 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:09:10 -0700 Subject: [PATCH 02/11] JIT64: optimize multiplication by immediate constants Factor out common code and handle a few more common cases. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 140 ++++++++---------- 2 files changed, 63 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index de95967df0..1f16d02e45 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -117,6 +117,8 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); + void MultiplyImmediate(u32 imm, int a, int d, bool overflow); + void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 2e05ce1623..3b64195a38 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -5,6 +5,7 @@ #include #include +#include "Common/MathUtil.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -1005,6 +1006,64 @@ void Jit64::subfx(UGeckoInstruction inst) } } +void Jit64::MultiplyImmediate(u32 imm, int a, int d, bool overflow) +{ + // simplest cases first + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + return; + } + + if (imm == (u32)-1) + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + NEG(32, gpr.R(d)); + return; + } + + // skip these if we need to check overflow flag + if (!overflow) + { + // power of 2; just a shift + if 
(IsPow2(imm)) + { + u32 shift = IntLog2(imm); + // use LEA if it saves an op + if (d != a && shift <= 3 && shift >= 1 && gpr.R(a).IsSimpleReg()) + { + LEA(32, gpr.RX(d), MScaled(gpr.RX(a), SCALE_1 << shift, 0)); + } + else + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + return; + } + + // We could handle factors of 2^N*3, 2^N*5, and 2^N*9 using lea+shl, but testing shows + // it seems to be slower overall. + static u8 lea_scales[3] = { 3, 5, 9 }; + for (int i = 0; i < 3; i++) + { + if (imm == lea_scales[i]) + { + if (d != a) + gpr.BindToRegister(a, true, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(a), SCALE_2 << i, 0)); + return; + } + } + } + + // if we didn't find any better options + IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); +} + void Jit64::mulli(UGeckoInstruction inst) { INSTRUCTION_START @@ -1020,46 +1079,7 @@ void Jit64::mulli(UGeckoInstruction inst) { gpr.Lock(a, d); gpr.BindToRegister(d, (d == a), true); - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); - } - + MultiplyImmediate(imm, a, d, false); gpr.UnlockAll(); } } @@ -1087,45 +1107,7 @@ void Jit64::mullwx(UGeckoInstruction inst) { u32 imm = gpr.R(a).IsImm() ? (u32)gpr.R(a).offset : (u32)gpr.R(b).offset; int src = gpr.R(a).IsImm() ? b : a; - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0 && !inst.OE) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(src), Imm32(imm)); - } + MultiplyImmediate(imm, src, d, inst.OE); } else if (d == a) { From de662a79b7fce6b309e03b85f5416ac89549db46 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:21:00 -0700 Subject: [PATCH 03/11] JIT64: optimize rlwinmx/rlwinix and friends Take advantage of movzx as a replacement for anding with 0xff or 0xffff, and abuse loads from the register cache to save ops. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 174 +++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 1f16d02e45..0b726521a6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -107,6 +107,10 @@ public: void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); + // Use to extract bytes from a register using the regcache. offset is in bytes. + Gen::OpArg ExtractFromReg(int reg, int offset); + void AndWithMask(Gen::X64Reg reg, u32 mask); + // Reads a given bit of a given CR register part. 
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); // Clobbers RDX. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 3b64195a38..90659d9729 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -138,6 +138,30 @@ void Jit64::ComputeRC(const Gen::OpArg & arg) } } +OpArg Jit64::ExtractFromReg(int reg, int offset) +{ + OpArg src = gpr.R(reg); + // store to load forwarding should handle this case efficiently + if (offset) + { + gpr.StoreFromRegister(reg, FLUSH_MAINTAIN_STATE); + src = gpr.GetDefaultLocation(reg); + src.offset += offset; + } + return src; +} + +// we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. +void Jit64::AndWithMask(X64Reg reg, u32 mask) + { + if (mask == 0xff) + MOVZX(32, 8, reg, R(reg)); + else if (mask == 0xffff) + MOVZX(32, 16, reg, R(reg)); + else + AND(32, R(reg), Imm32(mask)); +} + // Following static functions are used in conjunction with regimmop static u32 Add(u32 a, u32 b) { @@ -1576,49 +1600,57 @@ void Jit64::rlwinmx(UGeckoInstruction inst) result &= Helper_Mask(inst.MB, inst.ME); gpr.SetImmediate32(a, result); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { + bool left_shift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH; + bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; + u32 mask = Helper_Mask(inst.MB, inst.ME); + bool simple_mask = mask == 0xff || mask == 0xffff; + int mask_size = inst.ME - inst.MB + 1; + gpr.Lock(a, s); gpr.BindToRegister(a, a == s); - if (a != s) + if (a != s && left_shift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { - MOV(32, gpr.R(a), gpr.R(s)); + LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); } - - if (inst.SH && inst.MB == 0 && inst.ME==31-inst.SH) + // common optimized case: byte/word extract + else if (simple_mask && !(inst.SH & (mask_size - 1))) { + MOVZX(32, mask_size, gpr.RX(a), ExtractFromReg(s, inst.SH ? 
(32 - inst.SH) >> 3 : 0)); + } + // another optimized special case: byte/word extract plus shift + else if (((mask >> inst.SH) << inst.SH) == mask && !left_shift && + ((mask >> inst.SH) == 0xff || (mask >> inst.SH) == 0xffff)) + { + MOVZX(32, mask_size, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - } - else if (inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH) - { - SHR(32, gpr.R(a), Imm8(inst.MB)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { - if (inst.SH != 0) - { - ROL(32, gpr.R(a), Imm8(inst.SH)); - } + if (a != s) + MOV(32, gpr.R(a), gpr.R(s)); - if (!(inst.MB==0 && inst.ME==31)) + if (left_shift) { - AND(32, gpr.R(a), Imm32(Helper_Mask(inst.MB, inst.ME))); - if (inst.Rc) - ComputeRC(gpr.R(a)); + SHL(32, gpr.R(a), Imm8(inst.SH)); } - else if (inst.Rc) + else if (right_shift) { - ComputeRC(gpr.R(a)); + SHR(32, gpr.R(a), Imm8(inst.MB)); + } + else + { + if (inst.SH != 0) + ROL(32, gpr.R(a), Imm8(inst.SH)); + if (!(inst.MB == 0 && inst.ME == 31)) + AndWithMask(gpr.RX(a), mask); } } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1636,75 +1668,89 @@ void Jit64::rlwimix(UGeckoInstruction inst) u32 mask = Helper_Mask(inst.MB,inst.ME); gpr.SetImmediate32(a, ((u32)gpr.R(a).offset & ~mask) | (_rotl((u32)gpr.R(s).offset,inst.SH) & mask)); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { gpr.Lock(a, s); - gpr.BindToRegister(a, true, true); u32 mask = Helper_Mask(inst.MB, inst.ME); if (mask == 0 || (a == s && inst.SH == 0)) { - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + // nothing to do } else if (mask == 0xFFFFFFFF) { + gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - if (inst.SH) - { ROL(32, gpr.R(a), Imm8(inst.SH)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + } + else if(gpr.R(s).IsImm()) + { + gpr.BindToRegister(a, true, true); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), Imm32(_rotl((u32)gpr.R(s).offset, inst.SH) & mask)); } else if (inst.SH) { - if (mask == 0U - (1U << inst.SH)) + bool isLeftShift = mask == 0U - (1U << inst.SH); + bool isRightShift = mask == (1U << inst.SH) - 1; + if (gpr.R(a).IsImm()) { - MOV(32, R(RSCRATCH), gpr.R(s)); - SHL(32, R(RSCRATCH), Imm8(inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(RSCRATCH)); - } - else if (mask == (1U << inst.SH) - 1) - { - MOV(32, R(RSCRATCH), gpr.R(s)); - SHR(32, R(RSCRATCH), Imm8(32-inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(RSCRATCH)); + u32 maskA = gpr.R(a).offset & ~mask; + gpr.BindToRegister(a, false, true); + MOV(32, gpr.R(a), gpr.R(s)); + if (isLeftShift) + { + SHL(32, gpr.R(a), Imm8(inst.SH)); + } + else if (isRightShift) + { + SHR(32, gpr.R(a), Imm8(32 - inst.SH)); + } + else + { + ROL(32, gpr.R(a), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(mask)); + } + OR(32, gpr.R(a), Imm32(maskA)); } else { + // TODO: common cases of this might be faster with pinsrb or abuse of AH + gpr.BindToRegister(a, true, true); MOV(32, R(RSCRATCH), gpr.R(s)); - ROL(32, R(RSCRATCH), Imm8(inst.SH)); - XOR(32, R(RSCRATCH), gpr.R(a)); - AND(32, R(RSCRATCH), Imm32(mask)); - XOR(32, gpr.R(a), R(RSCRATCH)); + if (isLeftShift) + { + SHL(32, R(RSCRATCH), Imm8(inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(RSCRATCH)); + } + else if (isRightShift) + { + SHR(32, R(RSCRATCH), Imm8(32 - inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(RSCRATCH)); + } + else + { + ROL(32, R(RSCRATCH), Imm8(inst.SH)); + XOR(32, R(RSCRATCH), gpr.R(a)); + AndWithMask(RSCRATCH, mask); 
+ XOR(32, gpr.R(a), R(RSCRATCH)); + } } - - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { + gpr.BindToRegister(a, true, true); XOR(32, gpr.R(a), gpr.R(s)); - AND(32, gpr.R(a), Imm32(~mask)); + AndWithMask(gpr.RX(a), ~mask); XOR(32, gpr.R(a), gpr.R(s)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1736,7 +1782,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst) MOV(32, gpr.R(a), gpr.R(s)); } ROL(32, gpr.R(a), R(ECX)); - AND(32, gpr.R(a), Imm32(mask)); + AndWithMask(gpr.RX(a), mask); if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); From faf6bdfd96b1953fd6eb40935d9162ccd986902c Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:24:27 -0700 Subject: [PATCH 04/11] JIT64: Optimize cmpXX Use TEST instead of CMP if we're comparing against 0 (rather common), and optimize the case of immediate compares further. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 90659d9729..ec5f68cd18 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -460,12 +460,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) } else { + X64Reg input = RSCRATCH; if (signedCompare) { if (gpr.R(a).IsImm()) - MOV(64, R(RSCRATCH), Imm32((s32)gpr.R(a).offset)); + MOV(64, R(input), Imm32((s32)gpr.R(a).offset)); else - MOVSX(64, 32, RSCRATCH, gpr.R(a)); + MOVSX(64, 32, input, gpr.R(a)); if (!comparand.IsImm()) { @@ -476,19 +477,46 @@ void Jit64::cmpXX(UGeckoInstruction inst) else { if (gpr.R(a).IsImm()) - MOV(32, R(RSCRATCH), Imm32((u32)gpr.R(a).offset)); + { + MOV(32, R(input), Imm32((u32)gpr.R(a).offset)); + } + else if (comparand.IsImm() && !comparand.offset) + { + gpr.BindToRegister(a, true, false); + input = gpr.RX(a); + } else - MOVZX(64, 32, RSCRATCH, gpr.R(a)); + { + MOVZX(64, 32, input, gpr.R(a)); + } if (comparand.IsImm()) - MOV(32, R(RSCRATCH2), comparand); + { + // sign extension will ruin this, so store it in a register + if (comparand.offset & 0x80000000U) + { + MOV(32, R(RSCRATCH2), comparand); + comparand = R(RSCRATCH2); + } + } else - MOVZX(64, 32, RSCRATCH2, comparand); - - comparand = R(RSCRATCH2); + { + gpr.BindToRegister(b, true, false); + comparand = gpr.R(b); + } + } + if (comparand.IsImm() && !comparand.offset) + { + MOV(64, PPCSTATE(cr_val[crf]), R(input)); + // Place the comparison next to the branch for macro-op fusion + if (merge_branch) + TEST(64, R(input), R(input)); + } + else + { + SUB(64, R(input), comparand); + MOV(64, PPCSTATE(cr_val[crf]), R(input)); } - SUB(64, R(RSCRATCH), comparand); - MOV(64, PPCSTATE(cr_val[crf]), R(RSCRATCH)); if (merge_branch) { From 298f85e15243c562edd984f01a5eb3d8979bd8a2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:25:56 -0700 Subject: [PATCH 05/11] JIT64: optimize sign-extend Remove some code duplication. Also remove some comments that no longer apply since x86_32 was dropped. 
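For reference, the PowerPC semantics the merged extsXx handler implements, matching the immediate path in the diff that follows: a minimal C++ sketch using <cstdint> types instead of the tree's u32/s8/s16 aliases, with the helper names made up for illustration.

#include <cstdint>

// extsbx: sign-extend the low byte of rS into rA; extshx: the low halfword.
static inline uint32_t extsb(uint32_t rs) { return (uint32_t)(int32_t)(int8_t)rs; }
static inline uint32_t extsh(uint32_t rs) { return (uint32_t)(int32_t)(int16_t)rs; }
// e.g. extsb(0x000000FF) == 0xFFFFFFFF, extsh(0x00007FFF) == 0x00007FFF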
--- Source/Core/Core/PowerPC/Jit64/Jit.h | 3 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 53 ++++++------------- 3 files changed, 20 insertions(+), 40 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0b726521a6..a2b261d7b3 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -153,8 +153,7 @@ public: void addmex(UGeckoInstruction inst); void addzex(UGeckoInstruction inst); - void extsbx(UGeckoInstruction inst); - void extshx(UGeckoInstruction inst); + void extsXx(UGeckoInstruction inst); void sc(UGeckoInstruction _inst); void rfi(UGeckoInstruction _inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index fa7c19aec8..927e83353f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -190,8 +190,8 @@ static GekkoOPTemplate table31[] = {0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, - {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, - {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, + {922, &Jit64::extsXx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, + {954, &Jit64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index ec5f68cd18..01212e3065 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -796,56 +796,37 @@ void Jit64::boolX(UGeckoInstruction inst) } } -void Jit64::extsbx(UGeckoInstruction inst) +void Jit64::extsXx(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITIntegerOff); int a = inst.RA, s = inst.RS; + int size = inst.SUBOP10 == 922 ? 16 : 8; if (gpr.R(s).IsImm()) { - gpr.SetImmediate32(a, (u32)(s32)(s8)gpr.R(s).offset); + gpr.SetImmediate32(a, (u32)(s32)(size == 16 ? (s16)gpr.R(s).offset : (s8)gpr.R(s).offset)); + if (inst.Rc) + ComputeRC(gpr.R(a)); } else { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - MOVSX(32, 8, gpr.RX(a), gpr.R(s)); + // exts is moderately commonly used with inst.Rc, so try to optimize it. + if (inst.Rc) + { + // Only do one movsx; the movzx is free on most modern CPUs. 
+ MOVSX(64, size, gpr.RX(a), gpr.R(s)); + MOV(64, PPCSTATE(cr_val[0]), gpr.R(a)); + MOVZX(64, 32, gpr.RX(a), gpr.R(a)); + } + else + { + MOVSX(32, size, gpr.RX(a), gpr.R(s)); + } gpr.UnlockAll(); } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } -} - -void Jit64::extshx(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITIntegerOff); - int a = inst.RA, s = inst.RS; - - if (gpr.R(s).IsImm()) - { - gpr.SetImmediate32(a, (u32)(s32)(s16)gpr.R(s).offset); - } - else - { - gpr.Lock(a, s); - gpr.KillImmediate(s, true, false); - gpr.BindToRegister(a, a == s, true); - // This looks a little dangerous, but it's safe because - // every 32-bit register has a 16-bit half at the same index - // as the 32-bit register. - MOVSX(32, 16, gpr.RX(a), gpr.R(s)); - gpr.UnlockAll(); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } } void Jit64::subfic(UGeckoInstruction inst) From 9977da0550f7661a2cbe67c3371b123e94f65910 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:26:46 -0700 Subject: [PATCH 06/11] JIT64: avoid using LEA for adds when not necessary --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 01212e3065..2d7cfedd8e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1428,13 +1428,6 @@ void Jit64::addx(UGeckoInstruction inst) GenerateConstantOverflow((s64)i + (s64)j); } } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.Rc && !inst.OE) - { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); - gpr.UnlockAll(); - } else if ((d == a) || (d == b)) { int operand = ((d == a) ? 
b : a); @@ -1447,6 +1440,15 @@ void Jit64::addx(UGeckoInstruction inst) ComputeRC(gpr.R(d)); gpr.UnlockAll(); } + else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.OE) + { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } else { gpr.Lock(a, b, d); From 5b7761706952998b0e9844873c0a0a84cf4329a3 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:54:37 -0700 Subject: [PATCH 07/11] JIT64: use LEA for the "a = b + imm" case of addi --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 2d7cfedd8e..af40bbef78 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -208,8 +208,15 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void else { gpr.BindToRegister(d, false); - MOV(32, gpr.R(d), gpr.R(a)); - (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (doop == Add && gpr.R(a).IsSimpleReg() && !carry) + { + LEA(32, gpr.RX(d), MDisp(gpr.RX(a), value)); + } + else + { + MOV(32, gpr.R(d), gpr.R(a)); + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + } if (carry) GenerateCarry(); if (Rc) From a570c6b4a47accf098d658afe3e429e57742db78 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:29:58 -0700 Subject: [PATCH 08/11] JIT64: tweak srwx/slwx BindToRegister arguments Register B gets immediately moved into the shift register, so even if a == b it doesn't need to be loaded. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index af40bbef78..62e6425d4a 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1861,8 +1861,8 @@ void Jit64::srwx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1900,8 +1900,8 @@ void Jit64::slwx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); From a95d8cbcb482426f36e2cd325f5aba1ee5fbef24 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:35:57 -0700 Subject: [PATCH 09/11] JIT64: optimize carry handling Carries are rather common and unpredictable, so do them branchlessly wherever we can. 
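The branchless pattern this adds (JitSetCAIf in Jit_Util.cpp below) materializes the host condition with SETcc and ORs it into XER[CA] rather than branching. A C-level restatement of the before/after, assuming CA has already been cleared as the new JitClearCAOV/JitGetAndClearCAOV paths guarantee; the function names here are stand-ins, not emitter code.

#include <cstdint>

constexpr uint32_t XER_CA_SHIFT = 29;                 // as added to Gekko.h in this patch
constexpr uint32_t XER_CA_MASK  = 1u << XER_CA_SHIFT;

// Old: branch on the host carry flag, then set or clear XER.CA.
uint32_t UpdateCarryBranchy(uint32_t xer, bool carry)
{
    return carry ? (xer | XER_CA_MASK) : (xer & ~XER_CA_MASK);
}

// New: with CA cleared up front, the update is a single OR of the materialized
// flag (SETcc + MOVZX + SHL + OR against PPCSTATE(spr[SPR_XER])).
uint32_t UpdateCarryBranchless(uint32_t xer_with_ca_clear, bool carry)
{
    return xer_with_ca_clear | ((uint32_t)carry << XER_CA_SHIFT);
}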
--- Source/Core/Core/PowerPC/Gekko.h | 9 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 - .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 260 ++++-------------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 17 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 3 +- 6 files changed, 84 insertions(+), 211 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 1a9e97b559..3a97d96472 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -331,9 +331,12 @@ union UFPR float f[2]; }; -#define XER_CA_MASK 0x20000000 -#define XER_OV_MASK 0x40000000 -#define XER_SO_MASK 0x80000000 +#define XER_CA_SHIFT 29 +#define XER_OV_SHIFT 30 +#define XER_SO_SHIFT 31 +#define XER_CA_MASK (1U << XER_CA_SHIFT) +#define XER_OV_MASK (1U << XER_OV_SHIFT) +#define XER_SO_MASK (1U << XER_SO_SHIFT) // XER union UReg_XER { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index a2b261d7b3..ac3defbb9f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -101,10 +101,6 @@ public: void GenerateConstantOverflow(s64 val); void GenerateOverflow(); void FinalizeCarryOverflow(bool oe, bool inv = false); - void GetCarryRSCRATCHAndClear(); - void FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv = false); - void GenerateCarry(); - void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); // Use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 62e6425d4a..b119232bf1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -31,6 +31,7 @@ void Jit64::GenerateConstantOverflow(bool overflow) } } +// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. void Jit64::GenerateOverflow() { FixupBranch jno = J_CC(CC_NO); @@ -49,82 +50,24 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // USES_XER if (oe) { + // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both + // sides of the branch. FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry2); + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } else { // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); } } -void Jit64::GetCarryRSCRATCHAndClear() -{ - MOV(32, R(RSCRATCH), PPCSTATE(spr[SPR_XER])); - BTR(32, R(RSCRATCH), Imm8(29)); -} - -// Assumes that XER is in RSCRATCH and that the CA bit is clear. -void Jit64::FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv) -{ - // USES_XER - if (oe) - { - FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - //XER[OV/SO] = 1 - OR(32, R(RSCRATCH), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); - SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? 
CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry2); - //XER[OV] = 0 - AND(32, R(RSCRATCH), Imm32(~XER_OV_MASK)); - SetJumpTarget(exit); - } - else - { - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(RSCRATCH), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - } - // Dump RSCRATCH back into XER - MOV(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); -} - -// Assumes that the flags were just set through an addition. -void Jit64::GenerateCarry() -{ - // USES_XER - FixupBranch pNoCarry = J_CC(CC_NC); - OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); - FixupBranch pContinue = J(); - SetJumpTarget(pNoCarry); - AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~(XER_CA_MASK))); - SetJumpTarget(pContinue); -} - void Jit64::ComputeRC(const Gen::OpArg & arg) { if (arg.IsImm()) @@ -153,12 +96,12 @@ OpArg Jit64::ExtractFromReg(int reg, int offset) // we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. void Jit64::AndWithMask(X64Reg reg, u32 mask) - { +{ if (mask == 0xff) MOVZX(32, 8, reg, R(reg)); else if (mask == 0xffff) MOVZX(32, 16, reg, R(reg)); - else + else AND(32, R(reg), Imm32(mask)); } @@ -188,22 +131,16 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void gpr.Lock(d, a); if (a || binary || carry) // yeh nasty special case addic { + if (carry) + JitClearCAOV(false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); - if (Rc) - { - ComputeRC(gpr.R(d)); - } } else if (a == d) { gpr.KillImmediate(d, true, true); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } else { @@ -217,11 +154,11 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void MOV(32, gpr.R(d), gpr.R(a)); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; } - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } + if (carry) + JitSetCAIf(CC_C); + if (Rc) + ComputeRC(gpr.R(d)); } else if (doop == Add) { @@ -848,13 +785,11 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCA(); + JitClearCAOV(false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } else if (imm == -1) { @@ -864,24 +799,20 @@ void Jit64::subfic(UGeckoInstruction inst) } else { - JitClearCA(); + JitClearCAOV(false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - FixupBranch carry1 = J_CC(CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_C); } } else { - JitClearCA(); + JitClearCAOV(false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -926,7 +857,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; if (d == b) @@ -947,7 +878,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -963,14 +894,12 @@ void 
Jit64::subfmex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -986,14 +915,12 @@ void Jit64::subfzex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryRSCRATCHAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1011,13 +938,9 @@ void Jit64::subfx(UGeckoInstruction inst) s32 i = (s32)gpr.R(b).offset, j = (s32)gpr.R(a).offset; gpr.SetImmediate32(d, i - j); if (inst.Rc) - { ComputeRC(gpr.R(d)); - } if (inst.OE) - { GenerateConstantOverflow((s64)i - (s64)j); - } } else { @@ -1477,31 +1400,22 @@ void Jit64::addex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.Lock(a, b, d); + gpr.BindToRegister(d, (d == a) || (d == b)); + JitGetAndClearCAOV(inst.OE); if ((d == a) || (d == b)) { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } else { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addcx(UGeckoInstruction inst) @@ -1543,31 +1457,16 @@ void Jit64::addmex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addzex(UGeckoInstruction inst) @@ -1577,31 +1476,16 @@ void Jit64::addzex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryRSCRATCHAndClear(); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryRSCRATCHAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowRSCRATCH(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - 
gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::rlwinmx(UGeckoInstruction inst) @@ -1793,8 +1677,8 @@ void Jit64::rlwnmx(UGeckoInstruction inst) // no register choice gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, (a == s), true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1903,9 +1787,7 @@ void Jit64::slwx(UGeckoInstruction inst) MOV(32, R(ECX), gpr.R(b)); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } SHL(64, gpr.R(a), R(ECX)); if (inst.Rc) { @@ -1932,7 +1814,7 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.FlushLockX(ECX); gpr.Lock(a, s, b); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1941,16 +1823,11 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(RSCRATCH), gpr.R(a)); SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(RSCRATCH)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); + JitSetCAIf(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); - if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } void Jit64::srawix(UGeckoInstruction inst) @@ -1964,39 +1841,27 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(RSCRATCH), gpr.R(s)); if (a != s) - { MOV(32, gpr.R(a), R(RSCRATCH)); - } SAR(32, gpr.R(a), Imm8(amount)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - SHL(32, R(RSCRATCH), Imm8(32-amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); TEST(32, R(RSCRATCH), gpr.R(a)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); - gpr.UnlockAll(); + JitSetCAIf(CC_NZ); } else { gpr.Lock(a, s); - JitClearCA(); + JitClearCAOV(false); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } - gpr.UnlockAll(); } + if (inst.Rc) + ComputeRC(gpr.R(a)); + gpr.UnlockAll(); } // count leading zeroes @@ -2032,10 +1897,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst) } if (inst.Rc) - { ComputeRC(gpr.R(a)); - // TODO: Check PPC manual too - } } void Jit64::twx(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index d266023df5..3874c22a91 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCA(); + Jit->JitClearCAOV(false); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 6b80fd853d..32be48fe0d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -803,10 +803,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) OR(32, PPCSTATE(fpscr), R(RSCRATCH)); } - -void EmuCodeBlock::JitClearCA() +void EmuCodeBlock::JitGetAndClearCAOV(bool oe) { - AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 + if (oe) + AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0 + BTR(32, PPCSTATE(spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 
0 } void EmuCodeBlock::JitSetCA() @@ -814,6 +815,16 @@ void EmuCodeBlock::JitSetCA() OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 } +// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so +// branchless calculation of CA is probably faster in general. +void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) +{ + SETcc(conditionCode, R(RSCRATCH)); + MOVZX(32, 8, RSCRATCH, R(RSCRATCH)); + SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1 +} + void EmuCodeBlock::JitClearCAOV(bool oe) { if (oe) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 73eb9ebfe8..e50eedf08f 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -71,8 +71,9 @@ public: void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); - void JitClearCA(); + void JitGetAndClearCAOV(bool oe); void JitSetCA(); + void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCAOV(bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); From b56117de05287f4bf573ed5e6d1de46efe5a3cac Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 28 Aug 2014 10:21:46 -0700 Subject: [PATCH 10/11] JIT64: optimize some special cases of srawix Shift by 31 and 1, both of which are pretty common, can be done in a few less instructions. Tested with a hwtest. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b119232bf1..a3f0eec8c6 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1841,14 +1841,36 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCAOV(false); MOV(32, R(RSCRATCH), gpr.R(s)); if (a != s) MOV(32, gpr.R(a), R(RSCRATCH)); - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(RSCRATCH), Imm8(32 - amount)); - TEST(32, R(RSCRATCH), gpr.R(a)); - JitSetCAIf(CC_NZ); + // some optimized common cases that can be done in slightly fewer ops + if (amount == 31) + { + JitSetCA(); + SAR(32, gpr.R(a), Imm8(31)); + NEG(32, R(RSCRATCH)); // RSCRATCH = input == INT_MIN ? INT_MIN : -input; + AND(32, R(RSCRATCH), Imm32(0x80000000)); // RSCRATCH = input < 0 && input != INT_MIN ? 0 : 0x80000000 + SHR(32, R(RSCRATCH), Imm8(31 - XER_CA_SHIFT)); + XOR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = (input < 0 && input != INT_MIN) + } + else if (amount == 1) + { + JitClearCAOV(false); + SHR(32, R(RSCRATCH), Imm8(31)); // sign + AND(32, R(RSCRATCH), gpr.R(a)); // (sign && carry) + SAR(32, gpr.R(a), Imm8(1)); + SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT)); + OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + } + else + { + JitClearCAOV(false); + SAR(32, gpr.R(a), Imm8(amount)); + SHL(32, R(RSCRATCH), Imm8(32 - amount)); + TEST(32, R(RSCRATCH), gpr.R(a)); + JitSetCAIf(CC_NZ); + } } else { From 8fc57d61ba701088c2517001066d24fbc6035b2d Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 17 Aug 2014 23:12:16 -0700 Subject: [PATCH 11/11] JIT64: support merged branching for rlwinmx, too Not quite as common a branch instruction as cmpwi, but close. 
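The guest idiom this targets is an rlwinm. used as a bit-field test feeding a beq/bne on cr0 (only the EQ check is merged here). A hypothetical example of the pattern and its C equivalent; the registers and mask below are made up for illustration.

#include <cstdint>

// Guest code of roughly this shape:
//     rlwinm. r0, r3, 0, 27, 27    # r0 = r3 & 0x00000010; Rc=1 updates cr0 from r0
//     beq     skip                 # taken when the masked bit is clear
// is what the merged rlwinmx + bcx path compiles as one block exit.
bool BranchTaken(uint32_t r3)
{
    uint32_t r0 = r3 & 0x00000010u;  // SH=0, MB=27, ME=27 selects bit 0x10
    return r0 == 0;                  // beq tests cr0[EQ]
}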
--- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 216 ++++++++++-------- 2 files changed, 120 insertions(+), 98 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index ac3defbb9f..821df102d1 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -106,6 +106,8 @@ public: // Use to extract bytes from a register using the regcache. offset is in bytes. Gen::OpArg ExtractFromReg(int reg, int offset); void AndWithMask(Gen::X64Reg reg, u32 mask); + bool CheckMergedBranch(int crf); + void DoMergedBranch(); // Reads a given bit of a given CR register part. void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index a3f0eec8c6..5b405de5bb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -264,6 +264,54 @@ void Jit64::reg_imm(UGeckoInstruction inst) } } +bool Jit64::CheckMergedBranch(int crf) +{ + const UGeckoInstruction& next = js.next_inst; + return (((next.OPCD == 16 /* bcx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && + (next.BO & BO_DONT_DECREMENT_FLAG) && + !(next.BO & BO_DONT_CHECK_CONDITION) && + (next.BI >> 2) == crf); +} + +void Jit64::DoMergedBranch() +{ + // Code that handles successful PPC branching. + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + MOV(32, R(RSCRATCH), M(&CTR)); + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + WriteExitDestInRSCRATCH(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(RSCRATCH), M(&LR)); + AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.next_compilerPC + 4)); + WriteExitDestInRSCRATCH(); + } + else + { + PanicAlert("WTF invalid branch"); + } +} + void Jit64::cmpXX(UGeckoInstruction inst) { // USES_CR @@ -272,23 +320,7 @@ void Jit64::cmpXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; int crf = inst.CRFD; - - bool merge_branch = false; - int test_crf = js.next_inst.BI >> 2; - // Check if the next instruction is a branch - if it is, merge the two. - if (((js.next_inst.OPCD == 16 /* bcx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528) /* bcctrx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16) /* bclrx */)) && - (js.next_inst.BO & BO_DONT_DECREMENT_FLAG) && - !(js.next_inst.BO & BO_DONT_CHECK_CONDITION)) - { - // Looks like a decent conditional branch that we can merge with. - // It only test CR, not CTR. 
- if (test_crf == crf) - { - merge_branch = true; - } - } + bool merge_branch = CheckMergedBranch(crf); OpArg comparand; bool signedCompare; @@ -358,47 +390,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) { gpr.Flush(); fpr.Flush(); - - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - MOV(32, R(RSCRATCH), PPCSTATE_CTR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(RSCRATCH), PPCSTATE_LR); - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - WriteExitDestInRSCRATCH(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); } - else + else if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); - WriteExit(js.next_compilerPC + 4); - } + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); } } } @@ -487,51 +485,12 @@ void Jit64::cmpXX(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - // Code that handles successful PPC branching. - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - MOV(32, R(RSCRATCH), PPCSTATE_CTR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - WriteExitDestInRSCRATCH(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(RSCRATCH), PPCSTATE_LR); - AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC)); - - if (js.next_inst.LK) - MOV(32, PPCSTATE_LR, Imm32(js.next_compilerPC + 4)); - - WriteExitDestInRSCRATCH(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); SetJumpTarget(pDontBranch); if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); WriteExit(js.next_compilerPC + 4); - } } } @@ -1494,6 +1453,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA; int s = inst.RS; + + // rlwinm is commonly used as a branch test, second only to the more obvious cmpw. + // since it's almost never used with any check other than beq, only support beq for simplicity. + bool merge_branch = inst.Rc && CheckMergedBranch(0) && (js.next_inst.BI & 3) == 2; + if (gpr.R(s).IsImm()) { u32 result = (int)gpr.R(s).offset; @@ -1510,6 +1474,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; u32 mask = Helper_Mask(inst.MB, inst.ME); bool simple_mask = mask == 0xff || mask == 0xffff; + // in case of a merged branch, track whether or not we've set flags. 
+ // if not, we need to do a TEST later to get them. + bool needs_test = false; + // if we know the high bit can't be set, we can avoid doing a sign extend for flag storage + bool needs_sext = true; int mask_size = inst.ME - inst.MB + 1; gpr.Lock(a, s); @@ -1517,11 +1486,14 @@ void Jit64::rlwinmx(UGeckoInstruction inst) if (a != s && left_shift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); + needs_test = true; } // common optimized case: byte/word extract else if (simple_mask && !(inst.SH & (mask_size - 1))) { MOVZX(32, mask_size, gpr.RX(a), ExtractFromReg(s, inst.SH ? (32 - inst.SH) >> 3 : 0)); + needs_test = true; + needs_sext = false; } // another optimized special case: byte/word extract plus shift else if (((mask >> inst.SH) << inst.SH) == mask && !left_shift && @@ -1529,6 +1501,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst) { MOVZX(32, mask_size, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); + needs_sext = inst.SH + mask_size >= 32; } else { @@ -1542,17 +1515,64 @@ void Jit64::rlwinmx(UGeckoInstruction inst) else if (right_shift) { SHR(32, gpr.R(a), Imm8(inst.MB)); + needs_sext = false; } else { if (inst.SH != 0) ROL(32, gpr.R(a), Imm8(inst.SH)); if (!(inst.MB == 0 && inst.ME == 31)) - AndWithMask(gpr.RX(a), mask); + { + // we need flags if we're merging the branch + if (merge_branch) + AND(32, gpr.R(a), Imm32(mask)); + else + AndWithMask(gpr.RX(a), mask); + needs_sext = inst.MB == 0; + } + else + { + needs_test = true; + } } } - if (inst.Rc) + if (merge_branch) + { + js.downcountAmount++; + js.skipnext = true; + if (needs_sext) + { + MOVSX(64, 32, RSCRATCH, gpr.R(a)); + MOV(64, M(&PowerPC::ppcState.cr_val[0]), R(RSCRATCH)); + } + else + { + MOV(64, M(&PowerPC::ppcState.cr_val[0]), gpr.R(a)); + } + if (needs_test) + TEST(32, gpr.R(a), gpr.R(a)); + + gpr.UnlockAll(); + FixupBranch dont_branch = J_CC((js.next_inst.BO & BO_BRANCH_IF_TRUE) ? CC_NE : CC_E, true); + + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + + DoMergedBranch(); + + SetJumpTarget(dont_branch); + + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); + } + } + else if (inst.Rc) + { ComputeRC(gpr.R(a)); + } gpr.UnlockAll(); } }
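A small sketch of why the needs_sext shortcut above is sound: the cr0 staging value wants the 32-bit result sign-extended to 64 bits, and whenever bit 31 of the result is provably clear (e.g. after a MOVZX byte/word extract, or a right shift that cannot leave the top bit set), zero- and sign-extension coincide, so the extra MOVSX can be dropped.

#include <cassert>
#include <cstdint>

int64_t SignExtend32(uint32_t x)  { return (int64_t)(int32_t)x; }  // what MOVSX r64, r32 computes
uint64_t ZeroExtend32(uint32_t x) { return (uint64_t)x; }          // what a plain 32-bit MOV leaves

int main()
{
    for (uint32_t x : {0u, 1u, 0x7FFFFFFFu})
        assert((uint64_t)SignExtend32(x) == ZeroExtend32(x));      // equal whenever bit 31 is clear
    assert((uint64_t)SignExtend32(0x80000000u) != ZeroExtend32(0x80000000u));  // differs otherwise
    return 0;
}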