SPU LLVM: Absolute final fixes for icelake shufb paths

- The constant mask was accessing its bits in the reverse of the expected order
This commit is contained in:
Malcolm Jestadt 2021-04-20 23:10:57 -04:00 committed by Ivan
parent efd38fa940
commit 6247969ede

View File

@@ -7418,6 +7418,15 @@ public:
{
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
{
if (m_use_avx512_icl && (op.ra != op.rb))
{
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
const auto ab = vperm2b(as, bs, c);
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
return;
}
const auto x = avg(noncast<u8[16]>(sext<s8[16]>((c & 0xc0) == 0xc0)), noncast<u8[16]>(sext<s8[16]>((c & 0xe0) == 0xc0)));
const auto ax = pshufb(as, c);
const auto bx = pshufb(bs, c);
@@ -7455,6 +7464,16 @@ public:
}
}
if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn))
{
const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
const auto cr = eval(~c);
const auto ab = vperm2b(b, a, cr);
set_vr(op.rt4, select(noncast<s8[16]>(cr) >= 0, mm, ab));
return;
}
const auto x = avg(noncast<u8[16]>(sext<s8[16]>((c & 0xc0) == 0xc0)), noncast<u8[16]>(sext<s8[16]>((c & 0xe0) == 0xc0)));
const auto cr = eval(c ^ 0xf);
const auto ax = pshufb(a, cr);