mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-30 12:32:43 +00:00
rsx: Allow only sse4.1 capable CPUs to take the accelerated index path
- Older instruction sets (SSSE3 and earlier) lack the unsigned 16-bit and 32-bit min/max instructions that the accelerated index path requires
This commit is contained in:
parent
dadfdc35f4
commit
c59cb1bdd3
@ -627,59 +627,36 @@ namespace
|
||||
_mm_storeu_si128(dst_stream++, value);
|
||||
}
|
||||
|
||||
if (s_use_sse4_1)
|
||||
{
|
||||
const __m128i mask_step1 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
|
||||
const __m128i mask_step1 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
|
||||
|
||||
const __m128i mask_step2 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
|
||||
const __m128i mask_step2 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
|
||||
|
||||
const __m128i mask_step3 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0x3, 0x2);
|
||||
const __m128i mask_step3 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0x3, 0x2);
|
||||
|
||||
__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step2);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step3);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step2);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step3);
|
||||
min = __mm_min_epu16(min, tmp);
|
||||
|
||||
tmp = __mm_shuffle_epi8(max, mask_step1);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step2);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step3);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step1);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step2);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step3);
|
||||
max = __mm_max_epu16(max, tmp);
|
||||
|
||||
const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
|
||||
const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
|
||||
const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
|
||||
const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Manual min-max
|
||||
alignas(16) u16 _min[8];
|
||||
alignas(16) u16 _max[8];
|
||||
|
||||
_mm_store_si128((__m128i*)_min, min);
|
||||
_mm_store_si128((__m128i*)_max, max);
|
||||
|
||||
u16 min_index = _min[0];
|
||||
u16 max_index = _max[0];
|
||||
|
||||
for (int i = 1; i < 8; ++i)
|
||||
{
|
||||
min_index = std::min(min_index, _min[i]);
|
||||
max_index = std::max(max_index, _max[i]);
|
||||
}
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
|
||||
static
|
||||
@ -707,55 +684,29 @@ namespace
|
||||
_mm_storeu_si128(dst_stream++, value);
|
||||
}
|
||||
|
||||
if (s_use_sse4_1)
|
||||
{
|
||||
// Aggregate min-max
|
||||
const __m128i mask_step1 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
|
||||
// Aggregate min-max
|
||||
const __m128i mask_step1 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
|
||||
|
||||
const __m128i mask_step2 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
|
||||
const __m128i mask_step2 = _mm_set_epi8(
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
|
||||
|
||||
// a1, a2, a3, a4
|
||||
// a1, a2, a1, a2
|
||||
// mAX
|
||||
__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
|
||||
min = __mm_min_epu32(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step2);
|
||||
min = __mm_min_epu32(min, tmp);
|
||||
__m128i tmp = __mm_shuffle_epi8(min, mask_step1);
|
||||
min = __mm_min_epu32(min, tmp);
|
||||
tmp = __mm_shuffle_epi8(min, mask_step2);
|
||||
min = __mm_min_epu32(min, tmp);
|
||||
|
||||
tmp = __mm_shuffle_epi8(max, mask_step1);
|
||||
max = __mm_max_epu32(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step2);
|
||||
max = __mm_max_epu32(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step1);
|
||||
max = __mm_max_epu32(max, tmp);
|
||||
tmp = __mm_shuffle_epi8(max, mask_step2);
|
||||
max = __mm_max_epu32(max, tmp);
|
||||
|
||||
const u32 min_index = u32(_mm_cvtsi128_si32(min));
|
||||
const u32 max_index = u32(_mm_cvtsi128_si32(max));
|
||||
const u32 min_index = u32(_mm_cvtsi128_si32(min));
|
||||
const u32 max_index = u32(_mm_cvtsi128_si32(max));
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Manual min-max
|
||||
alignas(16) u32 _min[4];
|
||||
alignas(16) u32 _max[4];
|
||||
|
||||
_mm_store_si128((__m128i*)_min, min);
|
||||
_mm_store_si128((__m128i*)_max, max);
|
||||
|
||||
u32 min_index = _min[0];
|
||||
u32 max_index = _max[0];
|
||||
|
||||
for (int i = 1; i < 4; ++i)
|
||||
{
|
||||
min_index = std::min(min_index, _min[i]);
|
||||
max_index = std::max(max_index, _max[i]);
|
||||
}
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
@ -766,7 +717,7 @@ namespace
|
||||
u32 written;
|
||||
u32 remaining = src.size();
|
||||
|
||||
if (s_use_ssse3 && remaining >= 32)
|
||||
if (s_use_sse4_1 && remaining >= 32)
|
||||
{
|
||||
if constexpr (std::is_same<T, u32>::value)
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user