From d1bea790f3520fa493a6dc23e2d8554bedf11af9 Mon Sep 17 00:00:00 2001
From: Malcolm Jestadt
Date: Fri, 22 Sep 2023 16:15:30 -0400
Subject: [PATCH] SPU LLVM: Optimize GB/GBH/GBB with a GFNI path

- Abuses GFNI to extract bits from bytes, from 5->2 instructions in most cases
---
 rpcs3/Emu/CPU/CPUTranslator.cpp  | 11 +++++++++++
 rpcs3/Emu/CPU/CPUTranslator.h    |  3 +++
 rpcs3/Emu/Cell/SPURecompiler.cpp | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp
index ee8d31a318..96afeb53ac 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.cpp
+++ b/rpcs3/Emu/CPU/CPUTranslator.cpp
@@ -154,6 +154,16 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
 		m_use_vnni = true;
 	}
 
+	// Test GFNI feature (TODO)
+	if (cpu == "tremont" ||
+		cpu == "gracemont" ||
+		cpu == "alderlake" ||
+		cpu == "raptorlake" ||
+		cpu == "meteorlake")
+	{
+		m_use_gfni = true;
+	}
+
 	// Test AVX-512_icelake features (TODO)
 	if (cpu == "icelake" ||
 		cpu == "icelake-client" ||
@@ -168,6 +178,7 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin
 		m_use_avx512 = true;
 		m_use_avx512_icl = true;
 		m_use_vnni = true;
+		m_use_gfni = true;
 	}
 
 	// Aarch64 CPUs
diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index c08af85f6d..edf6c92a65 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -2971,6 +2971,9 @@ protected:
 	// Allow VNNI
 	bool m_use_vnni = false;
 
+	// Allow GFNI
+	bool m_use_gfni = false;
+
 	// Allow Icelake tier AVX-512
 	bool m_use_avx512_icl = false;
 
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
index 811feb9c0a..e90133c93c 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -8134,6 +8134,18 @@ public:
 
 	void GB(spu_opcode_t op)
 	{
+		// GFNI trick to extract selected bit from bytes
+		// By treating the first input as constant, and the second input as variable,
+		// with only 1 bit set in our constant, gf2p8affineqb will extract that selected bit
+		// from each byte of the second operand
+		if (m_use_gfni)
+		{
+			const auto a = get_vr(op.ra);
+			const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 12, 8, 4, 0);
+			set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0));
+			return;
+		}
+
 		const auto a = get_vr(op.ra);
 		const auto m = zext(bitcast(trunc(a)));
 		set_vr(op.rt, insert(splat(0), 3, eval(m)));
@@ -8141,6 +8153,14 @@ public:
 
 	void GBH(spu_opcode_t op)
 	{
+		if (m_use_gfni)
+		{
+			const auto a = get_vr(op.ra);
+			const auto as = zshuffle(a, 16, 16, 16, 16, 16, 16, 16, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+			set_vr(op.rt, gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x0, 0x0, 0x0), as, 0x0));
+			return;
+		}
+
 		const auto a = get_vr(op.ra);
 		const auto m = zext(bitcast(trunc(a)));
 		set_vr(op.rt, insert(splat(0), 3, eval(m)));
@@ -8148,7 +8168,16 @@ public:
 
 	void GBB(spu_opcode_t op)
 	{
-		const auto a = get_vr(op.ra);
+		const auto a = get_vr(op.ra);
+
+		if (m_use_gfni)
+		{
+			const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+			const auto m = gf2p8affineqb(build(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x01, 0x01, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0), as, 0x0);
+			set_vr(op.rt, zshuffle(m, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10));
+			return;
+		}
+
 		const auto m = zext(bitcast(trunc(a)));
 		set_vr(op.rt, insert(splat(0), 3, eval(m)));
 	}