simd_builder: fixups

Fix resetting vmask in reduce() step.
Fix AVX-512 loads in vec_load_unaligned().
Fix bzhi reg size in build_loop().
This commit is contained in:
Nekotekina 2022-09-08 17:06:09 +03:00 committed by Ivan
parent 5d91caebe9
commit a9437d69ab
2 changed files with 21 additions and 9 deletions

View File

@ -420,9 +420,10 @@ void asmjit::simd_builder::_init(uint new_vsize)
vsize = new_vsize ? new_vsize : 16; vsize = new_vsize ? new_vsize : 16;
} }
if (!new_vsize && utils::has_avx512()) if (utils::has_avx512())
{ {
vmask = -1; if (!new_vsize)
vmask = -1;
} }
else else
{ {
@ -604,7 +605,7 @@ void asmjit::simd_builder::vec_load_unaligned(u32 esize, const Operand& v, const
this->emit(x86::Inst::kIdVpinsrw, x86::Xmm(v.id()), x86::Xmm(v.id()), src, Imm(0)); this->emit(x86::Inst::kIdVpinsrw, x86::Xmm(v.id()), x86::Xmm(v.id()), src, Imm(0));
else if (vsize == 2) else if (vsize == 2)
this->emit(x86::Inst::kIdPinsrw, v, src, Imm(0)); this->emit(x86::Inst::kIdPinsrw, v, src, Imm(0));
else if (vmask && vmask < 8) else if ((vmask && vmask < 8) || vsize >= 64)
this->emit(x86::Inst::kIdVmovdqu16, v, src); this->emit(x86::Inst::kIdVmovdqu16, v, src);
else else
return vec_load_unaligned(vsize, v, src); return vec_load_unaligned(vsize, v, src);
@ -616,7 +617,7 @@ void asmjit::simd_builder::vec_load_unaligned(u32 esize, const Operand& v, const
this->emit(x86::Inst::kIdVmovd, x86::Xmm(v.id()), src); this->emit(x86::Inst::kIdVmovd, x86::Xmm(v.id()), src);
else if (vsize == 4) else if (vsize == 4)
this->emit(x86::Inst::kIdMovd, v, src); this->emit(x86::Inst::kIdMovd, v, src);
else if (vmask && vmask < 8) else if ((vmask && vmask < 8) || vsize >= 64)
this->emit(x86::Inst::kIdVmovdqu32, v, src); this->emit(x86::Inst::kIdVmovdqu32, v, src);
else else
return vec_load_unaligned(vsize, v, src); return vec_load_unaligned(vsize, v, src);
@ -628,7 +629,7 @@ void asmjit::simd_builder::vec_load_unaligned(u32 esize, const Operand& v, const
this->emit(x86::Inst::kIdVmovq, x86::Xmm(v.id()), src); this->emit(x86::Inst::kIdVmovq, x86::Xmm(v.id()), src);
else if (vsize == 8) else if (vsize == 8)
this->emit(x86::Inst::kIdMovq, v, src); this->emit(x86::Inst::kIdMovq, v, src);
else if (vmask && vmask < 8) else if ((vmask && vmask < 8) || vsize >= 64)
this->emit(x86::Inst::kIdVmovdqu64, v, src); this->emit(x86::Inst::kIdVmovdqu64, v, src);
else else
return vec_load_unaligned(vsize, v, src); return vec_load_unaligned(vsize, v, src);
@ -636,7 +637,9 @@ void asmjit::simd_builder::vec_load_unaligned(u32 esize, const Operand& v, const
else if (esize >= 16) else if (esize >= 16)
{ {
ensure(vsize >= 16); ensure(vsize >= 16);
if (utils::has_avx()) if ((vmask && vmask < 8) || vsize >= 64)
this->emit(x86::Inst::kIdVmovdqu64, v, src); // Not really needed
else if (utils::has_avx())
this->emit(x86::Inst::kIdVmovdqu, v, src); this->emit(x86::Inst::kIdVmovdqu, v, src);
else else
this->emit(x86::Inst::kIdMovups, v, src); this->emit(x86::Inst::kIdMovups, v, src);

View File

@ -323,11 +323,20 @@ namespace asmjit
// Build single last iteration (masked) // Build single last iteration (masked)
this->test(reg_cnt, reg_cnt); this->test(reg_cnt, reg_cnt);
this->jz(exit); this->jz(exit);
this->bzhi(reg_cnt, x86::Mem(consts[~u128()], 0), reg_cnt);
this->kmovq(x86::k7, reg_cnt); if (esize == 1 && vsize == 64)
{
this->bzhi(reg_cnt.r64(), x86::Mem(consts[~u128()], 0), reg_cnt.r64());
this->kmovq(x86::k7, reg_cnt.r64());
}
else
{
this->bzhi(reg_cnt.r32(), x86::Mem(consts[~u128()], 0), reg_cnt.r32());
this->kmovd(x86::k7, reg_cnt.r32());
}
vmask = 7; vmask = 7;
build(); build();
vmask = -1;
// Rollout reduction step // Rollout reduction step
this->bind(exit); this->bind(exit);