From b51aa4fa89c819f12c718a794ee8a62969e8b2d2 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:03:07 -0700 Subject: [PATCH 01/13] Rename Log2 and add IsPow2 to MathUtils for future use Also remove unused pow2/pow2f functions. --- Source/Core/Common/MathUtil.h | 7 +++---- Source/Core/DolphinWX/GameListCtrl.cpp | 2 +- Source/Core/VideoBackends/OGL/StreamBuffer.cpp | 2 +- .../Core/VideoCommon/TextureConversionShader.cpp | 6 +++--- Source/UnitTests/Common/MathUtilTest.cpp | 16 ++++++++-------- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index db35cbedcf..013e0b9733 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -166,16 +166,15 @@ struct Rectangle } // namespace MathUtil -inline float pow2f(float x) {return x * x;} -inline double pow2(double x) {return x * x;} - float MathFloatVectorSum(const std::vector&); #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) +inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;} + // Rounds down. 0 -> undefined -inline int Log2(u64 val) +inline int IntLog2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index 53cdc272e2..84b2d8edff 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size) // Find largest power of 2 less than _size. // div 10 to get largest named unit less than _size // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) - const u64 unit = Log2(std::max(_size, 1)) / 10; + const u64 unit = IntLog2(std::max(_size, 1)) / 10; const u64 unit_size = (1 << (unit * 10)); // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp index 91d4692b08..43cb1cb61b 100644 --- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp @@ -23,7 +23,7 @@ static u32 genBuffer() } StreamBuffer::StreamBuffer(u32 type, u32 size) -: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) +: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS)) { m_iterator = 0; m_used_iterator = 0; diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 76e49ce464..b0215e4c08 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); - WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1)); if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments @@ 
-100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); diff --git a/Source/UnitTests/Common/MathUtilTest.cpp b/Source/UnitTests/Common/MathUtilTest.cpp index 8ae757962c..9549039304 100644 --- a/Source/UnitTests/Common/MathUtilTest.cpp +++ b/Source/UnitTests/Common/MathUtilTest.cpp @@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN) EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits::signaling_NaN())); } -TEST(MathUtil, Log2) +TEST(MathUtil, IntLog2) { - EXPECT_EQ(0, Log2(1)); - EXPECT_EQ(1, Log2(2)); - EXPECT_EQ(2, Log2(4)); - EXPECT_EQ(3, Log2(8)); - EXPECT_EQ(63, Log2(0x8000000000000000ull)); + EXPECT_EQ(0, IntLog2(1)); + EXPECT_EQ(1, IntLog2(2)); + EXPECT_EQ(2, IntLog2(4)); + EXPECT_EQ(3, IntLog2(8)); + EXPECT_EQ(63, IntLog2(0x8000000000000000ull)); // Rounding behavior. - EXPECT_EQ(3, Log2(15)); - EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); + EXPECT_EQ(3, IntLog2(15)); + EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull)); } TEST(MathUtil, FlushToZero) From 58dc802ce276de1d79131eea5b069157b0467fe9 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:09:10 -0700 Subject: [PATCH 02/13] JIT64: optimize multiplication by immediate constants Factor out common code and handle a few more common cases. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 140 ++++++++---------- 2 files changed, 63 insertions(+), 79 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 5a515f1f81..47316f7944 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -118,6 +118,8 @@ public: Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); + void MultiplyImmediate(u32 imm, int a, int d, bool overflow); + void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); typedef u32 (*Operation)(u32 a, u32 b); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index ad30883018..f805a93279 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -5,6 +5,7 @@ #include #include +#include "Common/MathUtil.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -1007,6 +1008,64 @@ void Jit64::subfx(UGeckoInstruction inst) } } +void Jit64::MultiplyImmediate(u32 imm, int a, int d, bool overflow) +{ + // simplest cases first + if (imm == 0) + { + XOR(32, gpr.R(d), gpr.R(d)); + return; + } + + if (imm == (u32)-1) + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + NEG(32, gpr.R(d)); + return; + } + + // skip these if we need to check overflow flag + if (!overflow) + { + // power of 2; just a shift + if 
(IsPow2(imm)) + { + u32 shift = IntLog2(imm); + // use LEA if it saves an op + if (d != a && shift <= 3 && shift >= 1 && gpr.R(a).IsSimpleReg()) + { + LEA(32, gpr.RX(d), MScaled(gpr.RX(a), SCALE_1 << shift, 0)); + } + else + { + if (d != a) + MOV(32, gpr.R(d), gpr.R(a)); + if (shift) + SHL(32, gpr.R(d), Imm8(shift)); + } + return; + } + + // We could handle factors of 2^N*3, 2^N*5, and 2^N*9 using lea+shl, but testing shows + // it seems to be slower overall. + static u8 lea_scales[3] = { 3, 5, 9 }; + for (int i = 0; i < 3; i++) + { + if (imm == lea_scales[i]) + { + if (d != a) + gpr.BindToRegister(a, true, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(a), SCALE_2 << i, 0)); + return; + } + } + } + + // if we didn't find any better options + IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); +} + void Jit64::mulli(UGeckoInstruction inst) { INSTRUCTION_START @@ -1022,46 +1081,7 @@ void Jit64::mulli(UGeckoInstruction inst) { gpr.Lock(a, d); gpr.BindToRegister(d, (d == a), true); - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != a) - MOV(32, gpr.R(d), gpr.R(a)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(a), Imm32(imm)); - } - + MultiplyImmediate(imm, a, d, false); gpr.UnlockAll(); } } @@ -1089,45 +1109,7 @@ void Jit64::mullwx(UGeckoInstruction inst) { u32 imm = gpr.R(a).IsImm() ? (u32)gpr.R(a).offset : (u32)gpr.R(b).offset; int src = gpr.R(a).IsImm() ? b : a; - if (imm == 0) - { - XOR(32, gpr.R(d), gpr.R(d)); - } - else if (imm == (u32)-1) - { - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - NEG(32, gpr.R(d)); - } - else if ((imm & (imm - 1)) == 0 && !inst.OE) - { - u32 shift = 0; - - if (imm & 0xFFFF0000) - shift |= 16; - - if (imm & 0xFF00FF00) - shift |= 8; - - if (imm & 0xF0F0F0F0) - shift |= 4; - - if (imm & 0xCCCCCCCC) - shift |= 2; - - if (imm & 0xAAAAAAAA) - shift |= 1; - - if (d != src) - MOV(32, gpr.R(d), gpr.R(src)); - - if (shift) - SHL(32, gpr.R(d), Imm8(shift)); - } - else - { - IMUL(32, gpr.RX(d), gpr.R(src), Imm32(imm)); - } + MultiplyImmediate(imm, src, d, inst.OE); } else if (d == a) { From 41c3dde737d9dfbca01fce7ed1e9b561ab4023c8 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:21:00 -0700 Subject: [PATCH 03/13] JIT64: optimize rlwinmx/rlwinix and friends Take advantage of movzx as a replacement for anding with 0xff or 0xffff, and abuse loads from the register cache to save ops. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 174 +++++++++++------- 2 files changed, 114 insertions(+), 64 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 47316f7944..79c1b9c36f 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -107,6 +107,10 @@ public: void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); + // use to extract bytes from a register using the regcache. offset is in bytes. + Gen::OpArg ExtractFromReg(int reg, int offset); + void AndWithMask(Gen::X64Reg reg, u32 mask); + // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // don't forget to xlock it before. 
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index f805a93279..8398a5e97c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -138,6 +138,30 @@ void Jit64::ComputeRC(const Gen::OpArg & arg) } } +OpArg Jit64::ExtractFromReg(int reg, int offset) +{ + OpArg src = gpr.R(reg); + // store to load forwarding should handle this case efficiently + if (offset) + { + gpr.StoreFromRegister(reg, FLUSH_MAINTAIN_STATE); + src = gpr.GetDefaultLocation(reg); + src.offset += offset; + } + return src; +} + +// we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. +void Jit64::AndWithMask(X64Reg reg, u32 mask) + { + if (mask == 0xff) + MOVZX(32, 8, reg, R(reg)); + else if (mask == 0xffff) + MOVZX(32, 16, reg, R(reg)); + else + AND(32, R(reg), Imm32(mask)); +} + // Following static functions are used in conjunction with regimmop static u32 Add(u32 a, u32 b) { @@ -1577,49 +1601,57 @@ void Jit64::rlwinmx(UGeckoInstruction inst) result &= Helper_Mask(inst.MB, inst.ME); gpr.SetImmediate32(a, result); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { + bool isLeftShift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH; + bool isRightShift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; + u32 mask = Helper_Mask(inst.MB, inst.ME); + bool simpleMask = mask == 0xff || mask == 0xffff; + int maskSize = inst.ME - inst.MB + 1; + gpr.Lock(a, s); gpr.BindToRegister(a, a == s); - if (a != s) + if (a != s && isLeftShift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { - MOV(32, gpr.R(a), gpr.R(s)); + LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); } - - if (inst.SH && inst.MB == 0 && inst.ME==31-inst.SH) + // common optimized case: byte/word extract + else if (simpleMask && !(inst.SH & (maskSize - 1))) { + MOVZX(32, maskSize, gpr.RX(a), ExtractFromReg(s, inst.SH ? 
(32 - inst.SH) >> 3 : 0)); + } + // another optimized special case: byte/word extract plus shift + else if (((mask >> inst.SH) << inst.SH) == mask && !isLeftShift && + ((mask >> inst.SH) == 0xff || (mask >> inst.SH) == 0xffff)) + { + MOVZX(32, maskSize, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - } - else if (inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH) - { - SHR(32, gpr.R(a), Imm8(inst.MB)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { - if (inst.SH != 0) - { - ROL(32, gpr.R(a), Imm8(inst.SH)); - } + if (a != s) + MOV(32, gpr.R(a), gpr.R(s)); - if (!(inst.MB==0 && inst.ME==31)) + if (isLeftShift) { - AND(32, gpr.R(a), Imm32(Helper_Mask(inst.MB, inst.ME))); - if (inst.Rc) - ComputeRC(gpr.R(a)); + SHL(32, gpr.R(a), Imm8(inst.SH)); } - else if (inst.Rc) + else if (isRightShift) { - ComputeRC(gpr.R(a)); + SHR(32, gpr.R(a), Imm8(inst.MB)); + } + else + { + if (inst.SH != 0) + ROL(32, gpr.R(a), Imm8(inst.SH)); + if (!(inst.MB == 0 && inst.ME == 31)) + AndWithMask(gpr.RX(a), mask); } } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1637,75 +1669,89 @@ void Jit64::rlwimix(UGeckoInstruction inst) u32 mask = Helper_Mask(inst.MB,inst.ME); gpr.SetImmediate32(a, ((u32)gpr.R(a).offset & ~mask) | (_rotl((u32)gpr.R(s).offset,inst.SH) & mask)); if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } else { gpr.Lock(a, s); - gpr.BindToRegister(a, true, true); u32 mask = Helper_Mask(inst.MB, inst.ME); if (mask == 0 || (a == s && inst.SH == 0)) { - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + // nothing to do } else if (mask == 0xFFFFFFFF) { + gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - if (inst.SH) - { ROL(32, gpr.R(a), Imm8(inst.SH)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } + } + else if(gpr.R(s).IsImm()) + { + gpr.BindToRegister(a, true, true); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), Imm32(_rotl((u32)gpr.R(s).offset, inst.SH) & mask)); } else if (inst.SH) { - if (mask == 0U - (1U << inst.SH)) + bool isLeftShift = mask == 0U - (1U << inst.SH); + bool isRightShift = mask == (1U << inst.SH) - 1; + if (gpr.R(a).IsImm()) { - MOV(32, R(EAX), gpr.R(s)); - SHL(32, R(EAX), Imm8(inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(EAX)); - } - else if (mask == (1U << inst.SH) - 1) - { - MOV(32, R(EAX), gpr.R(s)); - SHR(32, R(EAX), Imm8(32-inst.SH)); - AND(32, gpr.R(a), Imm32(~mask)); - OR(32, gpr.R(a), R(EAX)); + u32 maskA = gpr.R(a).offset & ~mask; + gpr.BindToRegister(a, false, true); + MOV(32, gpr.R(a), gpr.R(s)); + if (isLeftShift) + { + SHL(32, gpr.R(a), Imm8(inst.SH)); + } + else if (isRightShift) + { + SHR(32, gpr.R(a), Imm8(32 - inst.SH)); + } + else + { + ROL(32, gpr.R(a), Imm8(inst.SH)); + AND(32, gpr.R(a), Imm32(mask)); + } + OR(32, gpr.R(a), Imm32(maskA)); } else { + // TODO: common cases of this might be faster with pinsrb or abuse of AH + gpr.BindToRegister(a, true, true); MOV(32, R(EAX), gpr.R(s)); - ROL(32, R(EAX), Imm8(inst.SH)); - XOR(32, R(EAX), gpr.R(a)); - AND(32, R(EAX), Imm32(mask)); - XOR(32, gpr.R(a), R(EAX)); + if (isLeftShift) + { + SHL(32, R(EAX), Imm8(inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(EAX)); + } + else if (isRightShift) + { + SHR(32, R(EAX), Imm8(32 - inst.SH)); + AndWithMask(gpr.RX(a), ~mask); + OR(32, gpr.R(a), R(EAX)); + } + else + { + ROL(32, R(EAX), Imm8(inst.SH)); + XOR(32, R(EAX), gpr.R(a)); + AndWithMask(EAX, mask); + XOR(32, gpr.R(a), R(EAX)); + } } - - if (inst.Rc) - ComputeRC(gpr.R(a)); } else { + 
gpr.BindToRegister(a, true, true); XOR(32, gpr.R(a), gpr.R(s)); - AND(32, gpr.R(a), Imm32(~mask)); + AndWithMask(gpr.RX(a), ~mask); XOR(32, gpr.R(a), gpr.R(s)); - if (inst.Rc) - ComputeRC(gpr.R(a)); } + if (inst.Rc) + ComputeRC(gpr.R(a)); gpr.UnlockAll(); } } @@ -1736,7 +1782,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst) MOV(32, gpr.R(a), gpr.R(s)); } ROL(32, gpr.R(a), R(ECX)); - AND(32, gpr.R(a), Imm32(mask)); + AndWithMask(gpr.RX(a), mask); if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); From 61af91ff163468cd9d99a9229001f614155ecc1e Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:24:27 -0700 Subject: [PATCH 04/13] JIT64: Optimize cmpXX Use TEST instead of CMP if we're comparing against 0 (rather common), and optimize the case of immediate compares further. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 8398a5e97c..1d43d2b406 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -479,13 +479,29 @@ void Jit64::cmpXX(UGeckoInstruction inst) MOVZX(64, 32, RAX, gpr.R(a)); if (comparand.IsImm()) - MOV(32, R(ABI_PARAM1), comparand); + { + // sign extension will ruin this, so store it in a register + if (comparand.offset & 0x80000000U) + { + MOV(32, R(ABI_PARAM1), comparand); + comparand = R(ABI_PARAM1); + } + } else + { MOVZX(64, 32, ABI_PARAM1, comparand); - - comparand = R(ABI_PARAM1); + comparand = R(ABI_PARAM1); + } + } + if (comparand.IsImm() && !comparand.offset) + { + if (merge_branch) + TEST(64, R(RAX), R(RAX)); + } + else + { + SUB(64, R(RAX), comparand); } - SUB(64, R(RAX), comparand); MOV(64, M(&PowerPC::ppcState.cr_val[crf]), R(RAX)); if (merge_branch) From 355850f499fd15a5bc6df987040d6c6a0987154b Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:25:56 -0700 Subject: [PATCH 05/13] JIT64: optimize sign/zero-extend Also remove some comments that no longer apply since x86_32 was dropped. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 1d43d2b406..45a164de98 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -796,11 +796,7 @@ void Jit64::extsbx(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - // Always force moving to EAX because it isn't possible - // to refer to the lowest byte of some registers, at least in - // 32-bit mode. - MOV(32, R(EAX), gpr.R(s)); - MOVSX(32, 8, gpr.RX(a), R(AL)); // watch out for ah and friends + MOVSX(32, 8, gpr.RX(a), gpr.R(s)); gpr.UnlockAll(); } @@ -823,11 +819,7 @@ void Jit64::extshx(UGeckoInstruction inst) else { gpr.Lock(a, s); - gpr.KillImmediate(s, true, false); gpr.BindToRegister(a, a == s, true); - // This looks a little dangerous, but it's safe because - // every 32-bit register has a 16-bit half at the same index - // as the 32-bit register. 
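// (On x86-64 a REX prefix makes the low byte of every GPR directly addressable
// (SIL, DIL, R8B..R15B), e.g. movsx eax, r9b encodes fine, so the old copy
// through EAX is unnecessary; only the high-byte forms AH/BH/CH/DH remain
// restricted, and nothing here uses those.)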
MOVSX(32, 16, gpr.RX(a), gpr.R(s)); gpr.UnlockAll(); } From cd0c52b537628cdfcce483cbbbdf310c62666dc0 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:26:46 -0700 Subject: [PATCH 06/13] JIT64: avoid using LEA for adds when not necessary --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 45a164de98..bb3bd15969 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1428,13 +1428,6 @@ void Jit64::addx(UGeckoInstruction inst) GenerateConstantOverflow((s64)i + (s64)j); } } - else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.Rc && !inst.OE) - { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); - gpr.UnlockAll(); - } else if ((d == a) || (d == b)) { int operand = ((d == a) ? b : a); @@ -1447,6 +1440,15 @@ void Jit64::addx(UGeckoInstruction inst) ComputeRC(gpr.R(d)); gpr.UnlockAll(); } + else if (gpr.R(a).IsSimpleReg() && gpr.R(b).IsSimpleReg() && !inst.OE) + { + gpr.Lock(a, b, d); + gpr.BindToRegister(d, false); + LEA(32, gpr.RX(d), MComplex(gpr.RX(a), gpr.RX(b), 1, 0)); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); + } else { gpr.Lock(a, b, d); From 27996a65cfef64701a3b6e1a43bf982531ed000c Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:54:37 -0700 Subject: [PATCH 07/13] JIT64: use LEA for the "a = b + imm" case of addi --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index bb3bd15969..a5e492b162 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -208,8 +208,15 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void else { gpr.BindToRegister(d, false); - MOV(32, gpr.R(d), gpr.R(a)); - (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + if (doop == Add && gpr.R(a).IsSimpleReg() && !carry) + { + LEA(32, gpr.RX(d), MDisp(gpr.RX(a), value)); + } + else + { + MOV(32, gpr.R(d), gpr.R(a)); + (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; + } if (carry) GenerateCarry(); if (Rc) From ad51fc7c4b2c7bd5df6aa86d235f573f1d0507dd Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 12:45:16 -0700 Subject: [PATCH 08/13] JIT64: use xor instead of mov for loading a zero regcache immediate --- Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 695d68b578..632e1d1694 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -299,7 +299,10 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode) void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc) { - emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); + if (regs[preg].location.IsImm() && !regs[preg].location.offset) + emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc)); + else + emit->MOV(32, ::Gen::R(newLoc), regs[preg].location); } void GPRRegCache::StoreRegister(size_t preg, OpArg newLoc) From 
ee24d4714a1ed3995870927b906af4ffe39b527d Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:29:58 -0700 Subject: [PATCH 09/13] JIT64: tweak srwx/slwx BindToRegister arguments Register B gets immediately moved into the shift register, so even if a == b it doesn't need to be loaded. --- Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index a5e492b162..1dd80562de 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1859,8 +1859,8 @@ void Jit64::srwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1897,8 +1897,8 @@ void Jit64::slwx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, a == s, true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); From 805be80f1277a6a11d789b7c742c59048daa84a1 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 24 Aug 2014 11:35:57 -0700 Subject: [PATCH 10/13] JIT64: Optimize carry handling Carries are rather common and unpredictable, so do them branchlessly wherever we can. --- Source/Core/Core/PowerPC/Gekko.h | 9 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 4 - .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 260 ++++-------------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 17 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 3 +- 6 files changed, 84 insertions(+), 211 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 99cc750ee1..9354cc9738 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -331,9 +331,12 @@ union UFPR float f[2]; }; -#define XER_CA_MASK 0x20000000 -#define XER_OV_MASK 0x40000000 -#define XER_SO_MASK 0x80000000 +#define XER_CA_SHIFT 29 +#define XER_OV_SHIFT 30 +#define XER_SO_SHIFT 31 +#define XER_CA_MASK (1U << XER_CA_SHIFT) +#define XER_OV_MASK (1U << XER_OV_SHIFT) +#define XER_SO_MASK (1U << XER_SO_SHIFT) // XER union UReg_XER { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 79c1b9c36f..d6eb895b47 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -101,10 +101,6 @@ public: void GenerateConstantOverflow(s64 val); void GenerateOverflow(); void FinalizeCarryOverflow(bool oe, bool inv = false); - void GetCarryEAXAndClear(); - void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false); - void GenerateCarry(); - void GenerateRC(); void ComputeRC(const Gen::OpArg & arg); // use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 1dd80562de..8f11862754 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -31,6 +31,7 @@ void Jit64::GenerateConstantOverflow(bool overflow) } } +// We could do overflow branchlessly, but unlike carry it seems to be quite a bit rarer. 
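+// (A branchless version is possible, e.g. a sketch mirroring JitSetCAIf below:
+//      SETcc(CC_O, R(EAX));
+//      MOVZX(32, 8, EAX, R(AL));
+//      SHL(32, R(EAX), Imm8(XER_OV_SHIFT));
+//      OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));
+//  but that only covers setting OV; clearing OV and keeping SO sticky would
+//  still need handling, and a well-predicted branch is nearly free anyway.)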
void Jit64::GenerateOverflow() { FixupBranch jno = J_CC(CC_NO); @@ -49,82 +50,24 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // USES_XER if (oe) { + // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both + // sides of the branch. FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry2); + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } else { // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(inv ? CC_NC : CC_C); } } -void Jit64::GetCarryEAXAndClear() -{ - MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); - BTR(32, R(EAX), Imm8(29)); -} - -// Assumes that XER is in EAX and that the CA bit is clear. -void Jit64::FinalizeCarryGenerateOverflowEAX(bool oe, bool inv) -{ - // USES_XER - if (oe) - { - FixupBranch jno = J_CC(CC_NO); - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - //XER[OV/SO] = 1 - OR(32, R(EAX), Imm32(XER_SO_MASK | XER_OV_MASK)); - FixupBranch exit = J(); - SetJumpTarget(jno); - // Do carry - FixupBranch carry2 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry2); - //XER[OV] = 0 - AND(32, R(EAX), Imm32(~XER_OV_MASK)); - SetJumpTarget(exit); - } - else - { - // Do carry - FixupBranch carry1 = J_CC(inv ? CC_C : CC_NC); - OR(32, R(EAX), Imm32(XER_CA_MASK)); - SetJumpTarget(carry1); - } - // Dump EAX back into XER - MOV(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); -} - -// Assumes that the flags were just set through an addition. -void Jit64::GenerateCarry() -{ - // USES_XER - FixupBranch pNoCarry = J_CC(CC_NC); - OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); - FixupBranch pContinue = J(); - SetJumpTarget(pNoCarry); - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~(XER_CA_MASK))); - SetJumpTarget(pContinue); -} - void Jit64::ComputeRC(const Gen::OpArg & arg) { if (arg.IsImm()) @@ -153,12 +96,12 @@ OpArg Jit64::ExtractFromReg(int reg, int offset) // we can't do this optimization in the emitter because MOVZX and AND have different effects on flags. 
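// (Specifically, AND sets SF/ZF/PF from its result and clears CF/OF, while
// MOVZX leaves every flag untouched, so a caller that wants flags from the
// masked value must TEST it afterwards.)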
void Jit64::AndWithMask(X64Reg reg, u32 mask) - { +{ if (mask == 0xff) MOVZX(32, 8, reg, R(reg)); else if (mask == 0xffff) MOVZX(32, 16, reg, R(reg)); - else + else AND(32, R(reg), Imm32(mask)); } @@ -188,22 +131,16 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void gpr.Lock(d, a); if (a || binary || carry) // yeh nasty special case addic { + if (carry) + JitClearCAOV(false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); - if (Rc) - { - ComputeRC(gpr.R(d)); - } } else if (a == d) { gpr.KillImmediate(d, true, true); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } else { @@ -217,11 +154,11 @@ void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void MOV(32, gpr.R(d), gpr.R(a)); (this->*op)(32, gpr.R(d), Imm32(value)); //m_GPR[d] = m_GPR[_inst.RA] + _inst.SIMM_16; } - if (carry) - GenerateCarry(); - if (Rc) - ComputeRC(gpr.R(d)); } + if (carry) + JitSetCAIf(CC_C); + if (Rc) + ComputeRC(gpr.R(d)); } else if (doop == Add) { @@ -849,13 +786,11 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCA(); + JitClearCAOV(false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } else if (imm == -1) { @@ -865,24 +800,20 @@ void Jit64::subfic(UGeckoInstruction inst) } else { - JitClearCA(); + JitClearCAOV(false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - FixupBranch carry1 = J_CC(CC_NC); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_C); } } else { - JitClearCA(); + JitClearCAOV(false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - FixupBranch carry1 = J_CC(CC_C); - JitSetCA(); - SetJumpTarget(carry1); + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -927,7 +858,7 @@ void Jit64::subfex(UGeckoInstruction inst) gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); bool invertedCarry = false; if (d == b) @@ -948,7 +879,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryGenerateOverflowEAX(inst.OE, invertedCarry); + FinalizeCarryOverflow(inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -964,14 +895,12 @@ void Jit64::subfmex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -987,14 +916,12 @@ void Jit64::subfzex(UGeckoInstruction inst) gpr.Lock(a, d); gpr.BindToRegister(d, d == a); - GetCarryEAXAndClear(); + JitGetAndClearCAOV(inst.OE); if (d != a) - { MOV(32, gpr.R(d), gpr.R(a)); - } NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); + FinalizeCarryOverflow(inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1012,13 +939,9 @@ void Jit64::subfx(UGeckoInstruction inst) s32 i = (s32)gpr.R(b).offset, j = (s32)gpr.R(a).offset; gpr.SetImmediate32(d, i - j); if (inst.Rc) - { ComputeRC(gpr.R(d)); - } if (inst.OE) - { GenerateConstantOverflow((s64)i - 
(s64)j); - } } else { @@ -1477,31 +1400,22 @@ void Jit64::addex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.Lock(a, b, d); + gpr.BindToRegister(d, (d == a) || (d == b)); + JitGetAndClearCAOV(inst.OE); if ((d == a) || (d == b)) { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); ADC(32, gpr.R(d), gpr.R((d == a) ? b : a)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } else { - gpr.Lock(a, b, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); } + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addcx(UGeckoInstruction inst) @@ -1543,31 +1457,16 @@ void Jit64::addmex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::addzex(UGeckoInstruction inst) @@ -1577,31 +1476,16 @@ void Jit64::addzex(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA, d = inst.RD; - if (d == a) - { - gpr.Lock(d); - gpr.BindToRegister(d, true); - - GetCarryEAXAndClear(); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } - else - { - gpr.Lock(a, d); - gpr.BindToRegister(d, false); - - GetCarryEAXAndClear(); + gpr.Lock(d); + gpr.BindToRegister(d, d == a); + JitGetAndClearCAOV(inst.OE); + if (d != a) MOV(32, gpr.R(d), gpr.R(a)); - ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryGenerateOverflowEAX(inst.OE); - if (inst.Rc) - ComputeRC(gpr.R(d)); - gpr.UnlockAll(); - } + ADC(32, gpr.R(d), Imm8(0)); + FinalizeCarryOverflow(inst.OE); + if (inst.Rc) + ComputeRC(gpr.R(d)); + gpr.UnlockAll(); } void Jit64::rlwinmx(UGeckoInstruction inst) @@ -1792,8 +1676,8 @@ void Jit64::rlwnmx(UGeckoInstruction inst) { gpr.FlushLockX(ECX); gpr.Lock(a, b, s); - gpr.BindToRegister(a, (a == b || a == s), true); MOV(32, R(ECX), gpr.R(b)); + gpr.BindToRegister(a, (a == s), true); if (a != s) { MOV(32, gpr.R(a), gpr.R(s)); @@ -1900,9 +1784,7 @@ void Jit64::slwx(UGeckoInstruction inst) MOV(32, R(ECX), gpr.R(b)); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } SHL(64, gpr.R(a), R(ECX)); if (inst.Rc) { @@ -1929,7 +1811,7 @@ void Jit64::srawx(UGeckoInstruction inst) gpr.Lock(a, s, b); gpr.FlushLockX(ECX); gpr.BindToRegister(a, (a == s || a == b), true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(ECX), gpr.R(b)); if (a != s) MOV(32, gpr.R(a), gpr.R(s)); @@ -1938,16 +1820,11 @@ void Jit64::srawx(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(a)); SHR(64, gpr.R(a), Imm8(32)); TEST(32, gpr.R(a), R(EAX)); - 
FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); + JitSetCAIf(CC_NZ); gpr.UnlockAll(); gpr.UnlockAllX(); - if (inst.Rc) - { ComputeRC(gpr.R(a)); - } } void Jit64::srawix(UGeckoInstruction inst) @@ -1961,21 +1838,14 @@ void Jit64::srawix(UGeckoInstruction inst) { gpr.Lock(a, s); gpr.BindToRegister(a, a == s, true); - JitClearCA(); + JitClearCAOV(false); MOV(32, R(EAX), gpr.R(s)); if (a != s) - { MOV(32, gpr.R(a), R(EAX)); - } SAR(32, gpr.R(a), Imm8(amount)); - if (inst.Rc) - ComputeRC(gpr.R(a)); - SHL(32, R(EAX), Imm8(32-amount)); + SHL(32, R(EAX), Imm8(32 - amount)); TEST(32, R(EAX), gpr.R(a)); - FixupBranch nocarry = J_CC(CC_Z); - JitSetCA(); - SetJumpTarget(nocarry); - gpr.UnlockAll(); + JitSetCAIf(CC_NZ); } else { @@ -1983,20 +1853,15 @@ void Jit64::srawix(UGeckoInstruction inst) FALLBACK_IF(true); gpr.Lock(a, s); - JitClearCA(); + JitClearCAOV(false); gpr.BindToRegister(a, a == s, true); if (a != s) - { MOV(32, gpr.R(a), gpr.R(s)); - } - - if (inst.Rc) - { - ComputeRC(gpr.R(a)); - } - gpr.UnlockAll(); } + if (inst.Rc) + ComputeRC(gpr.R(a)); + gpr.UnlockAll(); } // count leading zeroes @@ -2032,10 +1897,7 @@ void Jit64::cntlzwx(UGeckoInstruction inst) } if (inst.Rc) - { ComputeRC(gpr.R(a)); - // TODO: Check PPC manual too - } } void Jit64::twx(UGeckoInstruction inst) diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index a3b4a91881..c0abd0242c 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1110,7 +1110,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress) Jit->JitSetCA(); FixupBranch cont = Jit->J(); Jit->SetJumpTarget(nocarry); - Jit->JitClearCA(); + Jit->JitClearCAOV(false); Jit->SetJumpTarget(cont); regNormalRegClear(RI, I); break; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 58340b072e..f701c95ee9 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -802,10 +802,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm) OR(32, M(&FPSCR), R(EAX)); } - -void EmuCodeBlock::JitClearCA() +void EmuCodeBlock::JitGetAndClearCAOV(bool oe) { - AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 + if (oe) + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0 + BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0 } void EmuCodeBlock::JitSetCA() @@ -813,6 +814,16 @@ void EmuCodeBlock::JitSetCA() OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 } +// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so +// branchless calculation of CA is probably faster in general. 
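+// In plain C++ the pattern below is roughly:
+//     u32 ca = condition ? 1u : 0u;   // SETcc + MOVZX
+//     XER |= ca << XER_CA_SHIFT;      // SHL + OR, no branch needed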
+void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode) +{ + SETcc(conditionCode, R(EAX)); + MOVZX(32, 8, EAX, R(AL)); + SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1 +} + void EmuCodeBlock::JitClearCAOV(bool oe) { if (oe) diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index addce16e93..2ce315d20e 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -50,8 +50,9 @@ public: void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); - void JitClearCA(); + void JitGetAndClearCAOV(bool oe); void JitSetCA(); + void JitSetCAIf(Gen::CCFlags conditionCode); void JitClearCAOV(bool oe); void ForceSinglePrecisionS(Gen::X64Reg xmm); From 10d691a2779135f4a6e4683c5a501809d921e329 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 28 Aug 2014 10:21:46 -0700 Subject: [PATCH 11/13] JIT64: optimize some special cases of srawix Shift by 31 and 1, both of which are pretty common, can be done in a few less instructions. Tested with a hwtest. --- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 8f11862754..893921f5e8 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -1842,10 +1842,30 @@ void Jit64::srawix(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(s)); if (a != s) MOV(32, gpr.R(a), R(EAX)); - SAR(32, gpr.R(a), Imm8(amount)); - SHL(32, R(EAX), Imm8(32 - amount)); - TEST(32, R(EAX), gpr.R(a)); - JitSetCAIf(CC_NZ); + // some optimized common cases that can be done in slightly fewer ops + if (amount == 31) + { + SAR(32, gpr.R(a), Imm8(31)); + NEG(32, R(EAX)); // EAX = input == INT_MIN ? INT_MIN : -input; + AND(32, R(EAX), Imm32(0x80000000)); // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000 + SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT)); + XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = (input < 0 && input != INT_MIN) + } + else if (amount == 1) + { + SHR(32, R(EAX), Imm8(31)); // sign + AND(32, R(EAX), gpr.R(a)); // (sign && carry) + SAR(32, gpr.R(a), Imm8(1)); + SHL(32, R(EAX), Imm8(XER_CA_SHIFT)); + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001 + } + else + { + SAR(32, gpr.R(a), Imm8(amount)); + SHL(32, R(EAX), Imm8(32 - amount)); + TEST(32, R(EAX), gpr.R(a)); + JitSetCAIf(CC_NZ); + } } else { From a40278b1c452a734fd131add07c560b59a5438f6 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sun, 17 Aug 2014 23:12:16 -0700 Subject: [PATCH 12/13] JIT64: support merged branching for rlwinmx, too Not quite as common a branch instruction as cmpwi, but close. --- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 205 ++++++++++-------- 2 files changed, 112 insertions(+), 95 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index d6eb895b47..2d23bc8fbb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -106,6 +106,8 @@ public: // use to extract bytes from a register using the regcache. offset is in bytes. 
Gen::OpArg ExtractFromReg(int reg, int offset); void AndWithMask(Gen::X64Reg reg, u32 mask); + bool CheckMergedBranch(int crf); + void DoMergedBranch(); // Reads a given bit of a given CR register part. Clobbers ABI_PARAM1, // don't forget to xlock it before. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index 893921f5e8..b187ff3a71 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -264,6 +264,56 @@ void Jit64::reg_imm(UGeckoInstruction inst) } } +bool Jit64::CheckMergedBranch(int crf) +{ + const UGeckoInstruction& next = js.next_inst; + if (((next.OPCD == 16 /* bcx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 528) /* bcctrx */) || + ((next.OPCD == 19) && (next.SUBOP10 == 16) /* bclrx */)) && + (next.BO & BO_DONT_DECREMENT_FLAG) && + !(next.BO & BO_DONT_CHECK_CONDITION) && + (next.BI >> 2) == crf) + return true; + return false; +} + +void Jit64::DoMergedBranch() +{ + // Code that handles successful PPC branching. + if (js.next_inst.OPCD == 16) // bcx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + + u32 destination; + if (js.next_inst.AA) + destination = SignExt16(js.next_inst.BD << 2); + else + destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); + WriteExit(destination); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx + { + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + MOV(32, R(EAX), M(&CTR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + WriteExitDestInEAX(); + } + else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx + { + MOV(32, R(EAX), M(&LR)); + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + if (js.next_inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); + WriteExitDestInEAX(); + } + else + { + PanicAlert("WTF invalid branch"); + } +} + void Jit64::cmpXX(UGeckoInstruction inst) { // USES_CR @@ -272,23 +322,7 @@ void Jit64::cmpXX(UGeckoInstruction inst) int a = inst.RA; int b = inst.RB; int crf = inst.CRFD; - - bool merge_branch = false; - int test_crf = js.next_inst.BI >> 2; - // Check if the next instruction is a branch - if it is, merge the two. - if (((js.next_inst.OPCD == 16 /* bcx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528) /* bcctrx */) || - ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16) /* bclrx */)) && - (js.next_inst.BO & BO_DONT_DECREMENT_FLAG) && - !(js.next_inst.BO & BO_DONT_CHECK_CONDITION)) - { - // Looks like a decent conditional branch that we can merge with. - // It only test CR, not CTR. 
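// (This inline check is what CheckMergedBranch() above now factors out.)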
- if (test_crf == crf) - { - merge_branch = true; - } - } + bool merge_branch = CheckMergedBranch(crf); OpArg comparand; bool signedCompare; @@ -358,45 +392,13 @@ void Jit64::cmpXX(UGeckoInstruction inst) { gpr.Flush(); fpr.Flush(); - - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - MOV(32, R(EAX), M(&CTR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - WriteExitDestInEAX(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(EAX), M(&LR)); - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - WriteExitDestInEAX(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); } - else + else if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) { - if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - WriteExit(js.next_compilerPC + 4); - } + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); } } } @@ -473,51 +475,12 @@ void Jit64::cmpXX(UGeckoInstruction inst) gpr.Flush(FLUSH_MAINTAIN_STATE); fpr.Flush(FLUSH_MAINTAIN_STATE); - // Code that handles successful PPC branching. - if (js.next_inst.OPCD == 16) // bcx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - u32 destination; - if (js.next_inst.AA) - destination = SignExt16(js.next_inst.BD << 2); - else - destination = js.next_compilerPC + SignExt16(js.next_inst.BD << 2); - WriteExit(destination); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 528)) // bcctrx - { - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - MOV(32, R(EAX), M(&CTR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - WriteExitDestInEAX(); - } - else if ((js.next_inst.OPCD == 19) && (js.next_inst.SUBOP10 == 16)) // bclrx - { - MOV(32, R(EAX), M(&LR)); - AND(32, R(EAX), Imm32(0xFFFFFFFC)); - - if (js.next_inst.LK) - MOV(32, M(&LR), Imm32(js.compilerPC + 4)); - - WriteExitDestInEAX(); - } - else - { - PanicAlert("WTF invalid branch"); - } + DoMergedBranch(); SetJumpTarget(pDontBranch); if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) - { - gpr.Flush(); - fpr.Flush(); WriteExit(js.next_compilerPC + 4); - } } } @@ -1494,6 +1457,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) JITDISABLE(bJITIntegerOff); int a = inst.RA; int s = inst.RS; + + // rlwinm is commonly used as a branch test, second only to the more obvious cmpw. + // since it's almost never used with any check other than beq, only support beq for simplicity. + bool merge_branch = inst.Rc && CheckMergedBranch(0) && (js.next_inst.BI & 3) == 2; + if (gpr.R(s).IsImm()) { u32 result = (int)gpr.R(s).offset; @@ -1510,6 +1478,11 @@ void Jit64::rlwinmx(UGeckoInstruction inst) bool isRightShift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH; u32 mask = Helper_Mask(inst.MB, inst.ME); bool simpleMask = mask == 0xff || mask == 0xffff; + // in case of a merged branch, track whether or not we've set flags. + // if not, we need to do a TEST later to get them. 
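+	// (TEST reg,reg sets SF/ZF from the value without modifying it, which is
+	// all the merged beq/bne check needs.)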
+ bool needsTest = false; + // if we know the high bit can't be set, we can avoid doing a sign extend for flag storage + bool needsSext = true; int maskSize = inst.ME - inst.MB + 1; gpr.Lock(a, s); @@ -1517,11 +1490,14 @@ void Jit64::rlwinmx(UGeckoInstruction inst) if (a != s && isLeftShift && gpr.R(s).IsSimpleReg() && inst.SH <= 3) { LEA(32, gpr.RX(a), MScaled(gpr.RX(s), SCALE_1 << inst.SH, 0)); + needsTest = true; } // common optimized case: byte/word extract else if (simpleMask && !(inst.SH & (maskSize - 1))) { MOVZX(32, maskSize, gpr.RX(a), ExtractFromReg(s, inst.SH ? (32 - inst.SH) >> 3 : 0)); + needsTest = true; + needsSext = false; } // another optimized special case: byte/word extract plus shift else if (((mask >> inst.SH) << inst.SH) == mask && !isLeftShift && @@ -1529,6 +1505,7 @@ void Jit64::rlwinmx(UGeckoInstruction inst) { MOVZX(32, maskSize, gpr.RX(a), gpr.R(s)); SHL(32, gpr.R(a), Imm8(inst.SH)); + needsSext = inst.SH + maskSize >= 32; } else { @@ -1542,16 +1519,54 @@ void Jit64::rlwinmx(UGeckoInstruction inst) else if (isRightShift) { SHR(32, gpr.R(a), Imm8(inst.MB)); + needsSext = false; } else { if (inst.SH != 0) ROL(32, gpr.R(a), Imm8(inst.SH)); if (!(inst.MB == 0 && inst.ME == 31)) + { AndWithMask(gpr.RX(a), mask); + needsSext = inst.MB == 0; + needsTest = simpleMask; + } + else + { + needsTest = true; + } } } - if (inst.Rc) + if (merge_branch) + { + js.downcountAmount++; + js.skipnext = true; + + if (needsSext) + MOVSX(64, 32, gpr.RX(a), gpr.R(a)); + MOV(64, M(&PowerPC::ppcState.cr_val[0]), gpr.R(a)); + + if (needsTest) + TEST(32, gpr.R(a), gpr.R(a)); + + gpr.UnlockAll(); + FixupBranch pDontBranch = J_CC((js.next_inst.BO & BO_BRANCH_IF_TRUE) ? CC_NE : CC_E, true); + + gpr.Flush(FLUSH_MAINTAIN_STATE); + fpr.Flush(FLUSH_MAINTAIN_STATE); + + DoMergedBranch(); + + SetJumpTarget(pDontBranch); + + if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE)) + { + gpr.Flush(); + fpr.Flush(); + WriteExit(js.next_compilerPC + 4); + } + } + else if (inst.Rc) ComputeRC(gpr.R(a)); gpr.UnlockAll(); } From 3aa40dab001319b9f488c8e9f9a2ab3cd2f2d661 Mon Sep 17 00:00:00 2001 From: Fiora Date: Thu, 21 Aug 2014 13:56:18 -0700 Subject: [PATCH 13/13] JIT64: optimize carry calculations Omit carry calculations that get overwritten later in the block before they're used. Very common in the case of srawix and friends. 
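The liveness computation itself is a single backwards pass over the block. As
a sketch in plain C++ (abbreviated names, not the literal PPCAnalyst code):

    bool wantsCA = true;  // conservatively assume CA is live at block end
    for (int i = numOps - 1; i >= 0; i--)
    {
        ops[i].wantsCA = wantsCA;   // will the CA this op produces be read?
        if (ops[i].flags & FL_SET_CA)
            wantsCA = false;        // overwritten before any read: dead
        if (ops[i].flags & FL_READ_CA)
            wantsCA = true;         // read first (adde and friends): live
    }

Emitters then consult js.op->wantsCA and skip the XER update entirely when
nothing downstream will read it.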
--- .../Interpreter/Interpreter_Tables.cpp | 8 +- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 4 +- .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp | 131 ++++++++++-------- Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 2 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 10 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 42 ++++-- Source/Core/Core/PowerPC/PPCAnalyst.h | 4 +- 9 files changed, 119 insertions(+), 86 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp index 2bf66ae99b..317132266d 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_Tables.cpp @@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] = {10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}}, {12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}}, - {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}}, + {13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}}, {14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, {15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}}, @@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] = {922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, - {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, + {792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, + {824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}}, {24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}}, {54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}}, @@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] = {339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}}, {467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}}, {371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}}, - {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}}, + {512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}}, {595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, {659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 2d23bc8fbb..a07968eabb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -100,7 +100,7 @@ public: void GenerateConstantOverflow(bool overflow); void GenerateConstantOverflow(s64 val); void GenerateOverflow(); - void FinalizeCarryOverflow(bool oe, bool inv = false); + void 
FinalizeCarryOverflow(bool ca, bool oe, bool inv = false); void ComputeRC(const Gen::OpArg & arg); // use to extract bytes from a register using the regcache. offset is in bytes. diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 098c8f61e4..b965d6e91b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] = {922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, - {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, + {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, + {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}}, {24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}}, diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp index b187ff3a71..d0c6983c20 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp @@ -45,7 +45,7 @@ void Jit64::GenerateOverflow() } // Assumes CA,OV are clear -void Jit64::FinalizeCarryOverflow(bool oe, bool inv) +void Jit64::FinalizeCarryOverflow(bool ca, bool oe, bool inv) { // USES_XER if (oe) @@ -53,15 +53,17 @@ void Jit64::FinalizeCarryOverflow(bool oe, bool inv) // this is slightly messy because JitSetCAIf modifies x86 flags, so we have to do it in both // sides of the branch. FixupBranch jno = J_CC(CC_NO); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? CC_NC : CC_C); //XER[OV/SO] = 1 OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_SO_MASK | XER_OV_MASK)); FixupBranch exit = J(); SetJumpTarget(jno); - JitSetCAIf(inv ? CC_NC : CC_C); + if (ca) + JitSetCAIf(inv ? CC_NC : CC_C); SetJumpTarget(exit); } - else + else if (ca) { // Do carry JitSetCAIf(inv ? 
CC_NC : CC_C); @@ -129,10 +131,10 @@ static u32 Xor(u32 a, u32 b) void Jit64::regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc, bool carry) { gpr.Lock(d, a); + carry &= js.op->wantsCA; if (a || binary || carry) // yeh nasty special case addic { - if (carry) - JitClearCAOV(false); + JitClearCAOV(carry, false); if (gpr.R(a).IsImm() && !carry) { gpr.SetImmediate32(d, doop((u32)gpr.R(a).offset, value)); @@ -749,34 +751,38 @@ void Jit64::subfic(UGeckoInstruction inst) { if (imm == 0) { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); // Flags act exactly like subtracting from 0 NEG(32, gpr.R(d)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } else if (imm == -1) { // CA is always set in this case - JitSetCA(); + if (js.op->wantsCA) + JitSetCA(); NOT(32, gpr.R(d)); } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); NOT(32, gpr.R(d)); ADD(32, gpr.R(d), Imm32(imm+1)); // Output carry is normal - JitSetCAIf(CC_C); + if (js.op->wantsCA) + JitSetCAIf(CC_C); } } else { - JitClearCAOV(false); + JitClearCAOV(js.op->wantsCA, false); MOV(32, gpr.R(d), Imm32(imm)); SUB(32, gpr.R(d), gpr.R(a)); // Output carry is inverted - JitSetCAIf(CC_NC); + if (js.op->wantsCA) + JitSetCAIf(CC_NC); } gpr.UnlockAll(); // This instruction has no RC flag @@ -789,8 +795,7 @@ void Jit64::subfcx(UGeckoInstruction inst) int a = inst.RA, b = inst.RB, d = inst.RD; gpr.Lock(a, b, d); gpr.BindToRegister(d, (d == a || d == b), true); - - JitClearCAOV(inst.OE); + JitClearCAOV(js.op->wantsCA, inst.OE); if (d == b) { SUB(32, gpr.R(d), gpr.R(a)); @@ -808,7 +813,7 @@ void Jit64::subfcx(UGeckoInstruction inst) } if (inst.Rc) ComputeRC(gpr.R(d)); - FinalizeCarryOverflow(inst.OE, true); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, true); gpr.UnlockAll(); } @@ -842,7 +847,7 @@ void Jit64::subfex(UGeckoInstruction inst) NOT(32, gpr.R(d)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryOverflow(inst.OE, invertedCarry); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE, invertedCarry); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -863,7 +868,7 @@ void Jit64::subfmex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm32(0xFFFFFFFF)); - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -884,7 +889,7 @@ void Jit64::subfzex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); NOT(32, gpr.R(d)); ADC(32, gpr.R(d), Imm8(0)); - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); @@ -1375,7 +1380,7 @@ void Jit64::addex(UGeckoInstruction inst) MOV(32, gpr.R(d), gpr.R(a)); ADC(32, gpr.R(d), gpr.R(b)); } - FinalizeCarryOverflow(inst.OE); + FinalizeCarryOverflow(js.op->wantsCA, inst.OE); if (inst.Rc) ComputeRC(gpr.R(d)); gpr.UnlockAll(); @@ -1392,9 +1397,9 @@ void Jit64::addcx(UGeckoInstruction inst) int operand = ((d == a) ? 
@@ -1392,9 +1397,9 @@ void Jit64::addcx(UGeckoInstruction inst)
         int operand = ((d == a) ? b : a);
         gpr.Lock(a, b, d);
         gpr.BindToRegister(d, true);
-        JitClearCAOV(inst.OE);
+        JitClearCAOV(js.op->wantsCA, inst.OE);
         ADD(32, gpr.R(d), gpr.R(operand));
-        FinalizeCarryOverflow(inst.OE);
+        FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
         if (inst.Rc)
             ComputeRC(gpr.R(d));
         gpr.UnlockAll();
@@ -1403,10 +1408,10 @@ void Jit64::addcx(UGeckoInstruction inst)
     {
         gpr.Lock(a, b, d);
         gpr.BindToRegister(d, false);
-        JitClearCAOV(inst.OE);
+        JitClearCAOV(js.op->wantsCA, inst.OE);
         MOV(32, gpr.R(d), gpr.R(a));
         ADD(32, gpr.R(d), gpr.R(b));
-        FinalizeCarryOverflow(inst.OE);
+        FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
         if (inst.Rc)
             ComputeRC(gpr.R(d));
         gpr.UnlockAll();
@@ -1426,7 +1431,7 @@ void Jit64::addmex(UGeckoInstruction inst)
     if (d != a)
         MOV(32, gpr.R(d), gpr.R(a));
     ADC(32, gpr.R(d), Imm32(0xFFFFFFFF));
-    FinalizeCarryOverflow(inst.OE);
+    FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
     if (inst.Rc)
         ComputeRC(gpr.R(d));
     gpr.UnlockAll();
@@ -1445,7 +1450,7 @@ void Jit64::addzex(UGeckoInstruction inst)
     if (d != a)
         MOV(32, gpr.R(d), gpr.R(a));
     ADC(32, gpr.R(d), Imm8(0));
-    FinalizeCarryOverflow(inst.OE);
+    FinalizeCarryOverflow(js.op->wantsCA, inst.OE);
     if (inst.Rc)
         ComputeRC(gpr.R(d));
     gpr.UnlockAll();
@@ -1826,16 +1831,23 @@ void Jit64::srawx(UGeckoInstruction inst)
     gpr.Lock(a, s, b);
     gpr.FlushLockX(ECX);
     gpr.BindToRegister(a, (a == s || a == b), true);
-    JitClearCAOV(false);
+    JitClearCAOV(js.op->wantsCA, false);
     MOV(32, R(ECX), gpr.R(b));
     if (a != s)
         MOV(32, gpr.R(a), gpr.R(s));
     SHL(64, gpr.R(a), Imm8(32));
    SAR(64, gpr.R(a), R(ECX));
-    MOV(32, R(EAX), gpr.R(a));
-    SHR(64, gpr.R(a), Imm8(32));
-    TEST(32, gpr.R(a), R(EAX));
-    JitSetCAIf(CC_NZ);
+    if (js.op->wantsCA)
+    {
+        MOV(32, R(EAX), gpr.R(a));
+        SHR(64, gpr.R(a), Imm8(32));
+        TEST(32, gpr.R(a), R(EAX));
+        JitSetCAIf(CC_NZ);
+    }
+    else
+    {
+        SHR(64, gpr.R(a), Imm8(32));
+    }
     gpr.UnlockAll();
     gpr.UnlockAllX();
     if (inst.Rc)
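
The carry test kept in the wantsCA path above relies on the 64-bit shift trick: after SHL reg,32 and SAR reg,cl, the upper half of the register is the 32-bit shift result and the lower half holds exactly the bits that were shifted out. A scalar model of the test (illustrative only, not part of the patch; assumes arithmetic right shift of signed values, which holds on the compilers Dolphin targets):

    // CA for sraw: set iff the input was negative and a 1 bit was shifted out.
    static bool SrawCarryModel(s32 input, u32 amount)  // amount = rB & 0x3f
    {
        s64 wide = (s64)input << 32;
        wide >>= amount;                      // arithmetic shift, like SAR
        u32 shifted_out = (u32)wide;          // low half: the bits shifted out
        u32 result = (u32)((u64)wide >> 32);  // high half: the shift result
        // shifted_out keeps the lost bits in its top bits (its low bits are
        // zero), and result's corresponding top bits are sign copies, so the
        // AND is nonzero exactly when a 1 bit fell off a negative input.
        return (result & shifted_out) != 0;
    }
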
@@ -1853,33 +1865,42 @@ void Jit64::srawix(UGeckoInstruction inst)
     {
         gpr.Lock(a, s);
         gpr.BindToRegister(a, a == s, true);
-        JitClearCAOV(false);
-        MOV(32, R(EAX), gpr.R(s));
-        if (a != s)
-            MOV(32, gpr.R(a), R(EAX));
-        // some optimized common cases that can be done in slightly fewer ops
-        if (amount == 31)
+        if (!js.op->wantsCA)
         {
-            SAR(32, gpr.R(a), Imm8(31));
-            NEG(32, R(EAX));  // EAX = input == INT_MIN ? INT_MIN : -input;
-            AND(32, R(EAX), Imm32(0x80000000));  // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
-            SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
-            XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = (input < 0 && input != INT_MIN)
-        }
-        else if (amount == 1)
-        {
-            SHR(32, R(EAX), Imm8(31));  // sign
-            AND(32, R(EAX), gpr.R(a));  // (sign && carry)
-            SAR(32, gpr.R(a), Imm8(1));
-            SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
-            OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+            if (a != s)
+                MOV(32, gpr.R(a), gpr.R(s));
+            SAR(32, gpr.R(a), Imm8(amount));
         }
         else
         {
-            SAR(32, gpr.R(a), Imm8(amount));
-            SHL(32, R(EAX), Imm8(32 - amount));
-            TEST(32, R(EAX), gpr.R(a));
-            JitSetCAIf(CC_NZ);
+            JitClearCAOV(true, false);
+            MOV(32, R(EAX), gpr.R(s));
+            if (a != s)
+                MOV(32, gpr.R(a), R(EAX));
+            // some optimized common cases that can be done in slightly fewer ops
+            if (amount == 31)
+            {
+                SAR(32, gpr.R(a), Imm8(31));
+                NEG(32, R(EAX));  // EAX = input == INT_MIN ? INT_MIN : -input;
+                AND(32, R(EAX), Imm32(0x80000000));  // EAX = input < 0 && input != INT_MIN ? 0 : 0x80000000
+                SHR(32, R(EAX), Imm8(31 - XER_CA_SHIFT));
+                XOR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = (input < 0 && input != INT_MIN)
+            }
+            else if (amount == 1)
+            {
+                SHR(32, R(EAX), Imm8(31));  // sign
+                AND(32, R(EAX), gpr.R(a));  // (sign && carry)
+                SAR(32, gpr.R(a), Imm8(1));
+                SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
+                OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  // XER.CA = sign && carry, aka (input&0x80000001) == 0x80000001
+            }
+            else
+            {
+                SAR(32, gpr.R(a), Imm8(amount));
+                SHL(32, R(EAX), Imm8(32 - amount));
+                TEST(32, R(EAX), gpr.R(a));
+                JitSetCAIf(CC_NZ);
+            }
         }
     }
     else
@@ -1888,7 +1909,7 @@ void Jit64::srawix(UGeckoInstruction inst)
         FALLBACK_IF(true);
 
         gpr.Lock(a, s);
-        JitClearCAOV(false);
+        JitClearCAOV(js.op->wantsCA, false);
         gpr.BindToRegister(a, a == s, true);
         if (a != s)
diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
index c0abd0242c..f34b22774d 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp
@@ -1110,7 +1110,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
             Jit->JitSetCA();
             FixupBranch cont = Jit->J();
             Jit->SetJumpTarget(nocarry);
-            Jit->JitClearCAOV(false);
+            Jit->JitClearCAOV(true, false);
             Jit->SetJumpTarget(cont);
             regNormalRegClear(RI, I);
             break;
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index f701c95ee9..1c04a7c7f1 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -824,10 +824,10 @@ void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
     OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX));  //XER.CA = 1
 }
 
-void EmuCodeBlock::JitClearCAOV(bool oe)
+void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
 {
-    if (oe)
-        AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK));  //XER.CA, XER.OV = 0
-    else
-        AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK));  //XER.CA = 0
+    u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
+    if (mask == 0xFFFFFFFF)
+        return;
+    AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 2ce315d20e..579215a171 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -53,7 +53,7 @@ public:
     void JitGetAndClearCAOV(bool oe);
     void JitSetCA();
     void JitSetCAIf(Gen::CCFlags conditionCode);
-    void JitClearCAOV(bool oe);
+    void JitClearCAOV(bool ca, bool oe);
 
     void ForceSinglePrecisionS(Gen::X64Reg xmm);
     void ForceSinglePrecisionP(Gen::X64Reg xmm);
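
The JitClearCAOV rework above replaces the old two-way branch (which always cleared CA) with one mask built at JIT-compile time, and gains a fast path that emits no code when there is nothing to clear. The four cases, in isolation (illustrative only; `ClearMask` is a hypothetical helper, with XER_CA_MASK/XER_OV_MASK as defined in PowerPC.h):

    // The mask JitClearCAOV now computes before deciding whether to emit AND.
    static u32 ClearMask(bool ca, bool oe)
    {
        return (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
    }
    // ClearMask(false, false) == 0xFFFFFFFF  -> no instruction emitted at all
    // ClearMask(true,  false) == ~XER_CA_MASK
    // ClearMask(false, true)  == ~XER_OV_MASK  (OV-only clear, new with this patch)
    // ClearMask(true,  true)  == ~XER_CA_MASK & ~XER_OV_MASK
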
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
index 2c81a8447a..741b453739 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp
@@ -430,7 +430,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
 {
     code->wantsCR0 = false;
     code->wantsCR1 = false;
-    code->wantsPS1 = false;
 
     if (opinfo->flags & FL_USE_FPU)
         block->m_fpa->any = true;
@@ -458,6 +457,15 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
     code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
     code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
 
+    code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
+    code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
+
+    // mfspr/mtspr can affect/use XER, so be super careful here
+    if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339)  // mfspr
+        code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+    if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467)  // mtspr
+        code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
+
     int numOut = 0;
     int numIn = 0;
     if (opinfo->flags & FL_OUT_A)
@@ -715,26 +723,30 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
         block->m_broken = true;
     }
 
-    // Scan for CR0 dependency
-    // assume next block wants flags to be safe
+    // Scan for flag dependencies; assume the next block (or any branch that can leave the block)
+    // wants flags, to be safe.
     bool wantsCR0 = true;
     bool wantsCR1 = true;
-    bool wantsPS1 = true;
     bool wantsFPRF = true;
+    bool wantsCA = true;
     for (int i = block->m_num_instructions - 1; i >= 0; i--)
     {
-        wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
-        wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
-        wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
-        wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
-        code[i].wantsCR0 = wantsCR0;
-        code[i].wantsCR1 = wantsCR1;
-        code[i].wantsPS1 = wantsPS1;
+        bool opWantsCR0 = code[i].wantsCR0;
+        bool opWantsCR1 = code[i].wantsCR1;
+        bool opWantsFPRF = code[i].wantsFPRF;
+        bool opWantsCA = code[i].wantsCA;
+        wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
+        wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
+        wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
+        wantsCA |= opWantsCA || code[i].canEndBlock;
+        code[i].wantsCR0 = wantsCR0;
+        code[i].wantsCR1 = wantsCR1;
         code[i].wantsFPRF = wantsFPRF;
-        wantsCR0 &= !code[i].outputCR0;
-        wantsCR1 &= !code[i].outputCR1;
-        wantsPS1 &= !code[i].outputPS1;
-        wantsFPRF &= !code[i].outputFPRF;
+        code[i].wantsCA = wantsCA;
+        wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
+        wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
+        wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
+        wantsCA &= !code[i].outputCA || opWantsCA;
     }
     return address;
 }
diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h
index 0916e3951e..2177889336 100644
--- a/Source/Core/Core/PowerPC/PPCAnalyst.h
+++ b/Source/Core/Core/PowerPC/PPCAnalyst.h
@@ -33,12 +33,12 @@ struct CodeOp //16B
     bool isBranchTarget;
     bool wantsCR0;
     bool wantsCR1;
-    bool wantsPS1;
     bool wantsFPRF;
+    bool wantsCA;
     bool outputCR0;
     bool outputCR1;
-    bool outputPS1;
     bool outputFPRF;
+    bool outputCA;
     bool canEndBlock;
     bool skip;  // followed BL-s for example
 };
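
Taken together, the PPCAnalyst changes are a standard backward liveness scan: CA is assumed live at the bottom of the block, stays live across any instruction that reads it (or can exit the block), and dies at an instruction that overwrites it without reading it first. The opWants* snapshots fix the ordering so an op that both reads and writes a flag, like adde, keeps it live above itself. The CA slice of the loop in isolation (an illustrative sketch of the code above, not new logic):

    bool wantsCA = true;  // conservative: the next block may read XER.CA
    for (int i = block->m_num_instructions - 1; i >= 0; i--)
    {
        bool opWantsCA = code[i].wantsCA;   // this op reads CA itself
        // live here if read here, needed later, or control can leave the block
        wantsCA |= opWantsCA || code[i].canEndBlock;
        code[i].wantsCA = wantsCA;          // what the JIT checks at emit time
        // a write that is not also a read kills liveness above this op
        wantsCA &= !code[i].outputCA || opWantsCA;
    }
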