mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-01-29 00:33:01 +00:00
RSX: Add more wide paths for upload_untouched
- Adds AVX512 path for upload_untouched u16 with primitive restart, and AVX2 and AVX512 paths for upload_untouched without restart - The AVX512 paths handle the remainder in simd code with masking, which provided a large speedup - On my i5-1135G7 in demons souls benchmarking a scene in boletaria with a lot of geometry on screen via perf: SSE4_1 0.64% AVX2 0.59% AVX512 0.56% AVX512 w/ remainder masking 0.51%
This commit is contained in:
parent
ec530a2c91
commit
0d022d420b
@ -59,32 +59,32 @@ SSE4_1_FUNC static inline u16 sse41_hmax_epu16(__m128i x)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
constexpr bool s_use_sse4_1 = true;
|
||||
constexpr bool s_use_avx2 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = true;
|
||||
constexpr bool s_use_avx3 = true;
|
||||
#elif defined(__AVX2__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
constexpr bool s_use_sse4_1 = true;
|
||||
constexpr bool s_use_avx2 = true;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
constexpr bool s_use_avx3 = false;
|
||||
#elif defined(__SSE4_1__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
constexpr bool s_use_sse4_1 = true;
|
||||
constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
constexpr bool s_use_avx3 = false;
|
||||
#elif defined(__SSSE3__)
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true;
|
||||
constexpr bool s_use_sse4_1 = false;
|
||||
constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
constexpr bool s_use_avx3 = false;
|
||||
#elif defined(ARCH_X64)
|
||||
[[maybe_unused]] const bool s_use_ssse3 = utils::has_ssse3();
|
||||
const bool s_use_sse4_1 = utils::has_sse41();
|
||||
const bool s_use_avx2 = utils::has_avx2();
|
||||
[[maybe_unused]] const bool s_use_avx3 = utils::has_avx512();
|
||||
const bool s_use_avx3 = utils::has_avx512();
|
||||
#else
|
||||
[[maybe_unused]] constexpr bool s_use_ssse3 = true; // Non x86
|
||||
constexpr bool s_use_sse4_1 = true; // Non x86
|
||||
constexpr bool s_use_avx2 = false;
|
||||
[[maybe_unused]] constexpr bool s_use_avx3 = false;
|
||||
constexpr bool s_use_avx3 = false;
|
||||
#endif
|
||||
|
||||
const __m128i s_bswap_u32_mask = _mm_set_epi8(
|
||||
@ -296,6 +296,108 @@ namespace
|
||||
|
||||
struct untouched_impl
|
||||
{
|
||||
#if defined(ARCH_X64)
|
||||
AVX3_FUNC
|
||||
static
|
||||
std::tuple<u16, u16, u32> upload_u16_swapped_avx3(const void *src, void *dst, u32 count)
|
||||
{
|
||||
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
|
||||
|
||||
const __m512i s_remainder_mask = _mm512_set_epi16(
|
||||
0x20, 0x1F, 0x1E, 0x1D,
|
||||
0x1C, 0x1B, 0x1A, 0x19,
|
||||
0x18, 0x17, 0x16, 0x15,
|
||||
0x14, 0x13, 0x12, 0x11,
|
||||
0x10, 0xF, 0xE, 0xD,
|
||||
0xC, 0xB, 0xA, 0x9,
|
||||
0x8, 0x7, 0x6, 0x5,
|
||||
0x4, 0x3, 0x2, 0x1);
|
||||
|
||||
auto src_stream = static_cast<const __m512*>(src);
|
||||
auto dst_stream = static_cast<__m512*>(dst);
|
||||
|
||||
__m512i min = _mm512_set1_epi16(-1);
|
||||
__m512i max = _mm512_set1_epi16(0);
|
||||
|
||||
const auto iterations = count / 32;
|
||||
for (unsigned n = 0; n < iterations; ++n)
|
||||
{
|
||||
const __m512i raw = _mm512_loadu_si512(src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
max = _mm512_max_epu16(max, value);
|
||||
min = _mm512_min_epu16(min, value);
|
||||
_mm512_store_si512(dst_stream++, value);
|
||||
}
|
||||
|
||||
if ((iterations * 32) < count )
|
||||
{
|
||||
const u16 remainder = (count - (iterations * 32));
|
||||
const __m512i remBroadcast = _mm512_set1_epi16(remainder);
|
||||
const __mmask32 mask = _mm512_cmpge_epi16_mask(remBroadcast, s_remainder_mask);
|
||||
const __m512i raw = _mm512_maskz_loadu_epi16(mask, src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
max = _mm512_mask_max_epu16(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu16(min, mask, min, value);
|
||||
_mm512_mask_storeu_epi16(dst_stream++, mask, value);
|
||||
}
|
||||
|
||||
__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
|
||||
__m256i min2 = _mm512_castsi512_si256(min);
|
||||
min2 = _mm256_min_epu16(min2, tmp256);
|
||||
__m128i tmp = _mm256_extracti128_si256(min2, 1);
|
||||
__m128i min3 = _mm256_castsi256_si128(min2);
|
||||
min3 = _mm_min_epu16(min3, tmp);
|
||||
|
||||
tmp256 = _mm512_extracti64x4_epi64(max, 1);
|
||||
__m256i max2 = _mm512_castsi512_si256(max);
|
||||
max2 = _mm256_max_epu16(max2, tmp256);
|
||||
tmp = _mm256_extracti128_si256(max2, 1);
|
||||
__m128i max3 = _mm256_castsi256_si128(max2);
|
||||
max3 = _mm_max_epu16(max3, tmp);
|
||||
|
||||
const u16 min_index = sse41_hmin_epu16(min3);
|
||||
const u16 max_index = sse41_hmax_epu16(max3);
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
|
||||
AVX2_FUNC
|
||||
static
|
||||
std::tuple<u16, u16, u32> upload_u16_swapped_avx2(const void *src, void *dst, u32 count)
|
||||
{
|
||||
const __m256i shuffle_mask = _mm256_set_m128i(s_bswap_u16_mask, s_bswap_u16_mask);
|
||||
|
||||
auto src_stream = static_cast<const __m256i*>(src);
|
||||
auto dst_stream = static_cast<__m256i*>(dst);
|
||||
|
||||
__m256i min = _mm256_set1_epi16(-1);
|
||||
__m256i max = _mm256_set1_epi16(0);
|
||||
|
||||
const auto iterations = count / 16;
|
||||
for (unsigned n = 0; n < iterations; ++n)
|
||||
{
|
||||
const __m256i raw = _mm256_loadu_si256(src_stream++);
|
||||
const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask);
|
||||
max = _mm256_max_epu16(max, value);
|
||||
min = _mm256_min_epu16(min, value);
|
||||
_mm256_store_si256(dst_stream++, value);
|
||||
}
|
||||
|
||||
__m128i tmp = _mm256_extracti128_si256(min, 1);
|
||||
__m128i min2 = _mm256_castsi256_si128(min);
|
||||
min2 = _mm_min_epu16(min2, tmp);
|
||||
|
||||
tmp = _mm256_extracti128_si256(max, 1);
|
||||
__m128i max2 = _mm256_castsi256_si128(max);
|
||||
max2 = _mm_max_epu16(max2, tmp);
|
||||
|
||||
const u16 min_index = sse41_hmin_epu16(min2);
|
||||
const u16 max_index = sse41_hmax_epu16(max2);
|
||||
|
||||
return std::make_tuple(min_index, max_index, count);
|
||||
}
|
||||
#endif
|
||||
|
||||
SSE4_1_FUNC
|
||||
static
|
||||
std::tuple<u16, u16, u32> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 count)
|
||||
@ -375,8 +477,25 @@ namespace
|
||||
}
|
||||
else if constexpr (std::is_same<T, u16>::value)
|
||||
{
|
||||
const auto count = (remaining & ~0x7);
|
||||
std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count);
|
||||
if (s_use_avx3)
|
||||
{
|
||||
#if defined(ARCH_X64)
|
||||
|
||||
// Handle remainder in function
|
||||
std::tie(min_index, max_index, written) = upload_u16_swapped_avx3(src.data(), dst.data(), remaining);
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
else if (s_use_avx2)
|
||||
{
|
||||
const auto count = (remaining & ~0xf);
|
||||
std::tie(min_index, max_index, written) = upload_u16_swapped_avx2(src.data(), dst.data(), count);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto count = (remaining & ~0x7);
|
||||
std::tie(min_index, max_index, written) = upload_u16_swapped_sse4_1(src.data(), dst.data(), count);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -405,6 +524,77 @@ namespace
|
||||
struct primitive_restart_impl
|
||||
{
|
||||
#if defined(ARCH_X64)
|
||||
AVX3_FUNC
|
||||
static
|
||||
std::tuple<u16, u16> upload_u16_swapped_avx3(const void *src, void *dst, u32 count, u16 restart_index)
|
||||
{
|
||||
const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
|
||||
|
||||
const __m512i s_remainder_mask = _mm512_set_epi16(
|
||||
0x20, 0x1F, 0x1E, 0x1D,
|
||||
0x1C, 0x1B, 0x1A, 0x19,
|
||||
0x18, 0x17, 0x16, 0x15,
|
||||
0x14, 0x13, 0x12, 0x11,
|
||||
0x10, 0xF, 0xE, 0xD,
|
||||
0xC, 0xB, 0xA, 0x9,
|
||||
0x8, 0x7, 0x6, 0x5,
|
||||
0x4, 0x3, 0x2, 0x1);
|
||||
|
||||
auto src_stream = static_cast<const __m512*>(src);
|
||||
auto dst_stream = static_cast<__m512*>(dst);
|
||||
|
||||
__m512i restart = _mm512_set1_epi16(restart_index);
|
||||
__m512i min = _mm512_set1_epi16(-1);
|
||||
__m512i max = _mm512_set1_epi16(0);
|
||||
const __m512i ones = _mm512_set1_epi16(-1);
|
||||
|
||||
const auto iterations = count / 32;
|
||||
for (unsigned n = 0; n < iterations; ++n)
|
||||
{
|
||||
const __m512i raw = _mm512_loadu_si512(src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
|
||||
max = _mm512_mask_max_epu16(max, mask, max, value);
|
||||
min = _mm512_mask_min_epu16(min, mask, min, value);
|
||||
_mm512_store_si512(dst_stream++, value_with_max_restart);
|
||||
}
|
||||
|
||||
if ((iterations * 32) < count )
|
||||
{
|
||||
const u16 remainder = (count - (iterations * 32));
|
||||
const __m512i remBroadcast = _mm512_set1_epi16(remainder);
|
||||
const __mmask32 mask = _mm512_cmpge_epi16_mask(remBroadcast, s_remainder_mask);
|
||||
const __m512i raw = _mm512_maskz_loadu_epi16(mask, src_stream++);
|
||||
const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
|
||||
const __mmask32 mask2 = _mm512_cmpneq_epi16_mask(restart, value);
|
||||
const __mmask32 mask3 = _kand_mask32(mask, mask2);
|
||||
const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask3, ones, value);
|
||||
max = _mm512_mask_max_epu16(max, mask3, max, value);
|
||||
min = _mm512_mask_min_epu16(min, mask3, min, value);
|
||||
_mm512_mask_storeu_epi16(dst_stream++, mask, value_with_max_restart);
|
||||
}
|
||||
|
||||
__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
|
||||
__m256i min2 = _mm512_castsi512_si256(min);
|
||||
min2 = _mm256_min_epu16(min2, tmp256);
|
||||
__m128i tmp = _mm256_extracti128_si256(min2, 1);
|
||||
__m128i min3 = _mm256_castsi256_si128(min2);
|
||||
min3 = _mm_min_epu16(min3, tmp);
|
||||
|
||||
tmp256 = _mm512_extracti64x4_epi64(max, 1);
|
||||
__m256i max2 = _mm512_castsi512_si256(max);
|
||||
max2 = _mm256_max_epu16(max2, tmp256);
|
||||
tmp = _mm256_extracti128_si256(max2, 1);
|
||||
__m128i max3 = _mm256_castsi256_si128(max2);
|
||||
max3 = _mm_max_epu16(max3, tmp);
|
||||
|
||||
const u16 min_index = sse41_hmin_epu16(min3);
|
||||
const u16 max_index = sse41_hmax_epu16(max3);
|
||||
|
||||
return std::make_tuple(min_index, max_index);
|
||||
}
|
||||
|
||||
AVX2_FUNC
|
||||
static
|
||||
std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
|
||||
@ -526,9 +716,16 @@ namespace
|
||||
{
|
||||
if constexpr (std::is_same<T, u16>::value)
|
||||
{
|
||||
if (s_use_avx2)
|
||||
if (s_use_avx3)
|
||||
{
|
||||
#if defined(ARCH_X64)
|
||||
// Handle remainder in function
|
||||
written = length;
|
||||
std::tie(min_index, max_index) = upload_u16_swapped_avx3(src.data(), dst.data(), length, restart_index);
|
||||
return std::make_tuple(min_index, max_index, written);
|
||||
}
|
||||
else if (s_use_avx2)
|
||||
{
|
||||
u32 iterations = length >> 4;
|
||||
written = length & ~0xF;
|
||||
std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
|
||||
|
Loading…
x
Reference in New Issue
Block a user