SPU DMA: implement cmp_rdata_avx
Use a technique similar to mov_rdata_avx, with inline assembly.
parent 7da8ba5c15
commit 790fd9ce14
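For context, a minimal standalone sketch (plain C++, not part of this commit) of the reduction cmp_rdata_avx performs: two 128-byte blocks are equal exactly when the OR of all their pairwise XORs is zero, which is what the xor/or chain followed by a zero test checks in the AVX code below.

#include <cstdint>
#include <cstring>

// Scalar model of the 128-byte comparison: accumulate XOR differences with OR,
// then test the accumulator for zero. The AVX version does the same with four
// 32-byte ymm lanes and a single vptest / _mm256_testz_si256.
static bool cmp_128_bytes_scalar(const void* lhs, const void* rhs)
{
	std::uint64_t acc = 0;

	for (int i = 0; i < 16; i++) // 16 * 8 = 128 bytes
	{
		std::uint64_t a;
		std::uint64_t b;
		std::memcpy(&a, static_cast<const unsigned char*>(lhs) + i * 8, 8);
		std::memcpy(&b, static_cast<const unsigned char*>(rhs) + i * 8, 8);
		acc |= a ^ b; // any differing bit survives the OR
	}

	return acc == 0; // zero accumulator <=> identical contents
}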
@@ -31,8 +31,55 @@ static const bool s_tsx_avx = utils::has_avx();
// For special case
static const bool s_tsx_haswell = utils::has_rtm() && !utils::has_mpx();

static FORCE_INLINE bool cmp_rdata_avx(const __m256i* lhs, const __m256i* rhs)
{
#if defined(_MSC_VER) || defined(__AVX__)
	const __m256 x0 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 0)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 0)));
	const __m256 x1 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 1)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 1)));
	const __m256 x2 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 2)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 2)));
	const __m256 x3 = _mm256_xor_ps(_mm256_castsi256_ps(_mm256_load_si256(lhs + 3)), _mm256_castsi256_ps(_mm256_load_si256(rhs + 3)));
	const __m256 c0 = _mm256_or_ps(x0, x1);
	const __m256 c1 = _mm256_or_ps(x2, x3);
	const __m256 c2 = _mm256_or_ps(c0, c1);
	return _mm256_testz_si256(_mm256_castps_si256(c2), _mm256_castps_si256(c2)) != 0;
#else
	bool result = 0;
	__asm__(
		"vmovaps 0*32(%[lhs]), %%ymm0;" // load
		"vmovaps 1*32(%[lhs]), %%ymm1;"
		"vmovaps 2*32(%[lhs]), %%ymm2;"
		"vmovaps 3*32(%[lhs]), %%ymm3;"
		"vxorps 0*32(%[rhs]), %%ymm0, %%ymm0;" // compare
		"vxorps 1*32(%[rhs]), %%ymm1, %%ymm1;"
		"vxorps 2*32(%[rhs]), %%ymm2, %%ymm2;"
		"vxorps 3*32(%[rhs]), %%ymm3, %%ymm3;"
		"vorps %%ymm0, %%ymm1, %%ymm0;" // merge
		"vorps %%ymm2, %%ymm3, %%ymm2;"
		"vorps %%ymm0, %%ymm2, %%ymm0;"
		"vptest %%ymm0, %%ymm0;" // test
		"vzeroupper"
		: "=@ccz" (result)
		: [lhs] "r" (lhs)
		, [rhs] "r" (rhs)
		: "cc" // Clobber flags
		, "xmm0" // Clobber registers ymm0-ymm3 (see mov_rdata_avx)
		, "xmm1"
		, "xmm2"
		, "xmm3"
	);
	return result;
#endif
}

static FORCE_INLINE bool cmp_rdata(const decltype(spu_thread::rdata)& lhs, const decltype(spu_thread::rdata)& rhs)
{
#ifndef __AVX__
	if (s_tsx_avx) [[likely]]
#endif
	{
		return cmp_rdata_avx(reinterpret_cast<const __m256i*>(&lhs), reinterpret_cast<const __m256i*>(&rhs));
	}

	const v128 a = (lhs[0] ^ rhs[0]) | (lhs[1] ^ rhs[1]);
	const v128 b = (lhs[2] ^ rhs[2]) | (lhs[3] ^ rhs[3]);
	const v128 c = (lhs[4] ^ rhs[4]) | (lhs[5] ^ rhs[5]);
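The inline-assembly path above uses the GCC/Clang flag-output constraint "=@ccz", which hands the zero flag left by vptest directly to the bool result without an explicit setz. A self-contained illustration of that constraint (hypothetical helper, x86-64 GCC/Clang only, not RPCS3 code):

#include <cstdio>

// Hypothetical helper for illustration: the asm sets ZF with `test`, and the
// "=@ccz" flag output materializes that flag into a bool.
static bool is_zero(unsigned long long x)
{
	bool z;
	__asm__("test %[v], %[v]" // ZF = 1 when x == 0
		: "=@ccz" (z)
		: [v] "r" (x));
	return z;
}

int main()
{
	std::printf("%d %d\n", is_zero(0), is_zero(42)); // prints "1 0"
}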
@@ -1770,7 +1817,7 @@ bool spu_thread::process_mfc_cmd()
				continue;
			}

			if (g_cfg.core.spu_accurate_getllar && !cmp_rdata(dst, data))
			{
				i += 2;
				continue;
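Loosely read, the spu_accurate_getllar check above drives a retry-until-consistent snapshot: if the freshly copied block no longer matches the source, the loop backs off and copies again. A simplified standalone sketch of that pattern (hypothetical helpers and simplified types, not RPCS3 code):

#include <array>
#include <cstring>

using rdata_t = std::array<unsigned char, 128>;

// Stand-in for cmp_rdata: byte-wise equality of two 128-byte blocks.
static bool cmp_rdata_sketch(const rdata_t& a, const rdata_t& b)
{
	return std::memcmp(a.data(), b.data(), a.size()) == 0;
}

// Stand-in for the real 128-byte copy.
static rdata_t copy_line_sketch(const unsigned char* src)
{
	rdata_t out{};
	std::memcpy(out.data(), src, out.size());
	return out;
}

// Copy the line, then verify the source still matches the copy; if it changed
// mid-copy, retry until the snapshot is consistent.
static rdata_t consistent_snapshot(const unsigned char* src)
{
	while (true)
	{
		const rdata_t dst = copy_line_sketch(src);
		const rdata_t now = copy_line_sketch(src);

		if (cmp_rdata_sketch(dst, now))
		{
			return dst;
		}
	}
}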
@@ -2799,7 +2846,7 @@ bool spu_thread::stop_and_signal(u32 code)

		// Check group status, wait if necessary
		for (auto _state = +group->run_state;
			_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
			_state = group->run_state)
		{
			if (is_stopped())
@@ -3009,7 +3056,7 @@ bool spu_thread::stop_and_signal(u32 code)
		while (true)
		{
			for (auto _state = +group->run_state;
				_state >= SPU_THREAD_GROUP_STATUS_WAITING && _state <= SPU_THREAD_GROUP_STATUS_WAITING_AND_SUSPENDED;
				_state = group->run_state)
			{
				if (is_stopped())