mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-30 03:32:55 +00:00
PPU/SPU LLVM: Allow Zen4 cpus to use VPERMI2B/VPERMT2B instead of the vperm2b256to128 path
- Zen4-based CPUs can execute VPERMI2B/VPERMT2B in a single uop, unlike Intel CPUs, where these instructions take 3 uops.
This commit is contained in:
parent
7d32dc312f
commit
d8897c585d
@ -30,6 +30,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "util/types.hpp"
|
#include "util/types.hpp"
|
||||||
|
#include "util/sysinfo.hpp"
|
||||||
#include "Utilities/StrFmt.h"
|
#include "Utilities/StrFmt.h"
|
||||||
#include "Utilities/BitField.h"
|
#include "Utilities/BitField.h"
|
||||||
#include "Utilities/JIT.h"
|
#include "Utilities/JIT.h"
|
||||||
@ -3442,6 +3443,11 @@ public:
|
|||||||
template <typename T1, typename T2, typename T3>
|
template <typename T1, typename T2, typename T3>
|
||||||
value_t<u8[16]> vperm2b(T1 a, T2 b, T3 c)
|
value_t<u8[16]> vperm2b(T1 a, T2 b, T3 c)
|
||||||
{
|
{
|
||||||
|
if (!utils::has_fast_vperm2b())
|
||||||
|
{
|
||||||
|
return vperm2b256to128(a, b, c);
|
||||||
|
}
|
||||||
|
|
||||||
value_t<u8[16]> result;
|
value_t<u8[16]> result;
|
||||||
|
|
||||||
const auto data0 = a.eval(m_ir);
|
const auto data0 = a.eval(m_ir);
|
||||||
|
@ -1289,7 +1289,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
|
|||||||
if (m_use_avx512_icl)
|
if (m_use_avx512_icl)
|
||||||
{
|
{
|
||||||
const auto i = eval(~c);
|
const auto i = eval(~c);
|
||||||
set_vr(op.vd, vperm2b256to128(b, a, i));
|
set_vr(op.vd, vperm2b(b, a, i));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -8313,13 +8313,13 @@ public:
|
|||||||
{
|
{
|
||||||
if (perm_only)
|
if (perm_only)
|
||||||
{
|
{
|
||||||
set_vr(op.rt4, vperm2b256to128(as, bs, c));
|
set_vr(op.rt4, vperm2b(as, bs, c));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
||||||
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
||||||
const auto ab = vperm2b256to128(as, bs, c);
|
const auto ab = vperm2b(as, bs, c);
|
||||||
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -8371,18 +8371,18 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_use_avx512_icl && (op.ra != op.rb))
|
if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn))
|
||||||
{
|
{
|
||||||
if (perm_only)
|
if (perm_only)
|
||||||
{
|
{
|
||||||
set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf)));
|
set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf)));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
|
||||||
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
|
||||||
const auto cr = eval(c ^ 0xf);
|
const auto cr = eval(c ^ 0xf);
|
||||||
const auto ab = vperm2b256to128(a, b, cr);
|
const auto ab = vperm2b(a, b, cr);
|
||||||
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -227,6 +227,19 @@ bool utils::has_fma4()
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The Zen4 based CPUs support VPERMI2B/VPERMT2B in a single uop.
|
||||||
|
// Current Intel cpus (as of 2022) need 3 uops to execute these instructions.
|
||||||
|
// Check for SSE4A (which intel doesn't doesn't support) as well as VBMI.
|
||||||
|
bool utils::has_fast_vperm2b()
|
||||||
|
{
|
||||||
|
#if defined(ARCH_X64)
|
||||||
|
static const bool g_value = has_avx512() && (get_cpuid(7, 0)[2] & 0x2) == 0x2 && get_cpuid(0, 0)[0] >= 0x7 && (get_cpuid(0x80000001, 0)[2] & 0x20) == 0x20;
|
||||||
|
return g_value;
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
bool utils::has_erms()
|
bool utils::has_erms()
|
||||||
{
|
{
|
||||||
#if defined(ARCH_X64)
|
#if defined(ARCH_X64)
|
||||||
|
@ -37,6 +37,8 @@ namespace utils
|
|||||||
|
|
||||||
bool has_fma4();
|
bool has_fma4();
|
||||||
|
|
||||||
|
// True when VPERMI2B/VPERMT2B are expected to execute quickly on this CPU (always false on non-x64 builds)
bool has_fast_vperm2b();
|
||||||
|
|
||||||
bool has_erms();
|
bool has_erms();
|
||||||
|
|
||||||
bool has_fsrm();
|
bool has_fsrm();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user