From 439a78d12c7006e36f4bcb398ab4b6c83cdce236 Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Fri, 2 Feb 2018 03:49:51 +0300
Subject: [PATCH] SPU ASMJIT: rewrite 128-bit shifts by bit

Six instructions changed to use xmm registers instead of gpr.
ROTQBII, ROTQMBII, SHLQBII look better (shifts by imm).
ROTQBI, ROTQMBI, SHLQBI changed for consistency (shifts by variable).
---
 rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 128 +++++++++++++------------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
index 2aa34b5b73..3400adf07c 100644
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@@ -1716,50 +1716,57 @@ void spu_recompiler::CDX(spu_opcode_t op)
 
 void spu_recompiler::ROTQBI(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->mov(*qw2, *qw0);
-	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
-	c->and_(*addr, 7);
-	c->shld(*qw0, *qw1, *addr);
-	c->shld(*qw1, *qw2, *addr);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*addr);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
-	c->unuse(*qw2);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
+	const XmmLink& vt = XmmAlloc();
+	const XmmLink& v4 = XmmAlloc();
+	c->psrldq(vb, 12);
+	c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
+	c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
+	c->pshufd(vt, va, 0x4e);
+	c->psubq(v4, vb);
+	c->psllq(va, vb);
+	c->psrlq(vt, v4);
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::ROTQMBI(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
-	c->neg(*addr);
-	c->and_(*addr, 7);
-	c->shrd(*qw0, *qw1, *addr);
-	c->shr(*qw1, *addr);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*addr);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vb = XmmAlloc();
+	const XmmLink& vt = XmmGet(op.rb, XmmType::Int);
+	const XmmLink& v4 = XmmAlloc();
+	c->psrldq(vt, 12);
+	c->pxor(vb, vb);
+	c->psubq(vb, vt);
+	c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
+	c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
+	c->movdqa(vt, va);
+	c->psrldq(vt, 8);
+	c->psubq(v4, vb);
+	c->psrlq(va, vb);
+	c->psllq(vt, v4);
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::SHLQBI(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
-	c->and_(*addr, 7);
-	c->shld(*qw1, *qw0, *addr);
-	c->shl(*qw0, *addr);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*addr);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
+	const XmmLink& vt = XmmAlloc();
+	const XmmLink& v4 = XmmAlloc();
+	c->psrldq(vb, 12);
+	c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
+	c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
+	c->movdqa(vt, va);
+	c->pslldq(vt, 8);
+	c->psubq(v4, vb);
+	c->psllq(va, vb);
+	c->psrlq(vt, v4);
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::ROTQBY(spu_opcode_t op)
@@ -1974,40 +1981,37 @@ void spu_recompiler::CDD(spu_opcode_t op)
 
 void spu_recompiler::ROTQBII(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->mov(*qw2, *qw0);
-	c->shld(*qw0, *qw1, op.i7 & 0x7);
-	c->shld(*qw1, *qw2, op.i7 & 0x7);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
-	c->unuse(*qw2);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vt = XmmAlloc();
+	c->pshufd(vt, va, 0x4e); // swap 64-bit parts
+	c->psllq(va, (op.i7 & 0x7));
+	c->psrlq(vt, 64 - (op.i7 & 0x7));
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::ROTQMBII(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->shrd(*qw0, *qw1, 0-op.i7 & 0x7);
-	c->shr(*qw1, 0-op.i7 & 0x7);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vt = XmmAlloc();
+	c->movdqa(vt, va);
+	c->psrldq(vt, 8);
+	c->psrlq(va, ((0 - op.i7) & 0x7));
+	c->psllq(vt, 64 - ((0 - op.i7) & 0x7));
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::SHLQBII(spu_opcode_t op)
 {
-	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
-	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
-	c->shld(*qw1, *qw0, op.i7 & 0x7);
-	c->shl(*qw0, op.i7 & 0x7);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
-	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
-	c->unuse(*qw0);
-	c->unuse(*qw1);
+	const XmmLink& va = XmmGet(op.ra, XmmType::Int);
+	const XmmLink& vt = XmmAlloc();
+	c->movdqa(vt, va);
+	c->pslldq(vt, 8);
+	c->psllq(va, (op.i7 & 0x7));
+	c->psrlq(vt, 64 - (op.i7 & 0x7));
+	c->por(vt, va);
+	c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
 }
 
 void spu_recompiler::ROTQBYI(spu_opcode_t op)
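
Note for reviewers (not part of the diff): every variable-count path above follows the same SSE2 scheme — mask the shift count from rb's preferred slot (word element 3) into the low qword, shift each 64-bit half of ra by n with psllq/psrlq (whose register form takes the count from the low qword of an XMM register), shift a swapped or half-cleared copy the other way by 64 - n, and OR the two results together. A minimal standalone sketch of the ROTQBI case, written with plain intrinsics rather than the asmjit emitter; the helper name rotqbi_ref is illustrative only and does not exist in rpcs3:

    #include <emmintrin.h> // SSE2

    // 128-bit rotate left by (b.u32[3] & 7) bits -- same dataflow the
    // recompiler emits for ROTQBI, expressed with intrinsics for reference.
    static __m128i rotqbi_ref(__m128i a, __m128i b)
    {
        const __m128i n  = _mm_and_si128(_mm_srli_si128(b, 12),
                                         _mm_set_epi64x(0, 7));          // count in low qword
        const __m128i m  = _mm_sub_epi64(_mm_set_epi64x(0, 64), n);      // 64 - n
        const __m128i lo = _mm_sll_epi64(a, n);                          // each qword << n
        const __m128i hi = _mm_srl_epi64(_mm_shuffle_epi32(a, 0x4e), m); // swapped qwords >> (64 - n)
        return _mm_or_si128(lo, hi);
    }

The n == 0 case falls out naturally because psllq/psrlq with a count of 64 produce zero, so no branch is needed. The immediate forms (ROTQBII, ROTQMBII, SHLQBII) use the same pattern with op.i7 folded into the shift immediates.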