SPU DMA: implement cmp_rdata_avx

Use technique similar to mov_rdata_avx with inline assembly.
This commit is contained in:
Nekotekina 2020-04-28 17:23:43 +03:00
parent 7da8ba5c15
commit 790fd9ce14

View File

@ -31,8 +31,55 @@ static const bool s_tsx_avx = utils::has_avx();
// For special case
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();
static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
{
#if defined(_MSC_VER) || defined(__AVX__)
const __m256 x0 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 0)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 0)));
const __m256 x1 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 1)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 1)));
const __m256 x2 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 2)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 2)));
const __m256 x3 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 3)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 3)));
const __m256 c0 = _mm256_or_ps(x0, x1);
const __m256 c1 = _mm256_or_ps(x2, x3);
const __m256 c2 = _mm256_or_ps(c0, c1);
return _mm256_testz_si256(_mm256_castps_si256(c2), _mm256_castps_si256(c2)) != 0;
#else
bool result = 0;
__asm__(
"vmovaps 0*32(%[lhs]), %%ymm0;" // load
"vmovaps 1*32(%[lhs]), %%ymm1;"
"vmovaps 2*32(%[lhs]), %%ymm2;"
"vmovaps 3*32(%[lhs]), %%ymm3;"
"vxorps 0*32(%[rhs]), %%ymm0, %%ymm0;" // compare
"vxorps 1*32(%[rhs]), %%ymm1, %%ymm1;"
"vxorps 2*32(%[rhs]), %%ymm2, %%ymm2;"
"vxorps 3*32(%[rhs]), %%ymm3, %%ymm3;"
"vorps %%ymm0, %%ymm1, %%ymm0;" // merge
"vorps %%ymm2, %%ymm3, %%ymm2;"
"vorps %%ymm0, %%ymm2, %%ymm0;"
"vptest %%ymm0, %%ymm0;" // test
"vzeroupper"
: "=@ccz" (result)
: [lhs] "r" (lhs)
, [rhs] "r" (rhs)
: "cc" // Clobber flags
, "xmm0" // Clobber registers ymm0-ymm3 (see mov_rdata_avx)
, "xmm1"
, "xmm2"
, "xmm3"
);
return result;
#endif
}
static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs)
{
#ifndef __AVX__
if (s_tsx_avx) [[likely]]
#endif
{
return cmp_rdata_avx(reinterpret_cast<const __m256i*>(&lhs), reinterpret_cast<const __m256i*>(&rhs));
}
const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
@ -1770,7 +1817,7 @@ bool spu_thread::process_mfc_cmd()
continue;
}
if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
{
i += 2;
continue;
@ -2799,7 +2846,7 @@ bool spu_thread::stop_and_signal(u32 code)
// Check group status, wait if necessary
for (auto _state = +group->run_state;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state = group->run_state)
{
if (is_stopped())
@ -3009,7 +3056,7 @@ bool spu_thread::stop_and_signal(u32 code)
while (true)
{
for (auto _state = +group->run_state;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
_state = group->run_state)
{
if (is_stopped())