diff --git a/Utilities/JIT.cpp b/Utilities/JIT.cpp
index bbbb924302..47ca498fac 100644
--- a/Utilities/JIT.cpp
+++ b/Utilities/JIT.cpp
@@ -508,6 +508,54 @@ void asmjit::simd_builder::vec_clobbering_test(u32 esize, const Operand& v, cons
 	}
 }
 
+void asmjit::simd_builder::vec_broadcast_gpr(u32 esize, const Operand& v, const x86::Gp& r)
+{
+	if (esize == 2)
+	{
+		if (utils::has_avx512())
+			this->emit(x86::Inst::kIdVpbroadcastw, v, r.r32());
+		else if (utils::has_avx())
+		{
+			this->emit(x86::Inst::kIdVmovd, v, r.r32());
+			if (utils::has_avx2())
+				this->emit(x86::Inst::kIdVpbroadcastw, v, v);
+			else
+			{
+				this->emit(x86::Inst::kIdVpunpcklwd, v, v, v);
+				this->emit(x86::Inst::kIdVpshufd, v, v, Imm(0));
+			}
+		}
+		else
+		{
+			this->emit(x86::Inst::kIdMovd, v, r.r32());
+			this->emit(x86::Inst::kIdPunpcklwd, v, v);
+			this->emit(x86::Inst::kIdPshufd, v, v, Imm(0));
+		}
+	}
+	else if (esize == 4)
+	{
+		if (utils::has_avx512())
+			this->emit(x86::Inst::kIdVpbroadcastd, v, r.r32());
+		else if (utils::has_avx())
+		{
+			this->emit(x86::Inst::kIdVmovd, v, r.r32());
+			if (utils::has_avx2())
+				this->emit(x86::Inst::kIdVpbroadcastd, v, v);
+			else
+				this->emit(x86::Inst::kIdVpshufd, v, v, Imm(0));
+		}
+		else
+		{
+			this->emit(x86::Inst::kIdMovd, v, r.r32());
+			this->emit(x86::Inst::kIdPshufd, v, v, Imm(0));
+		}
+	}
+	else
+	{
+		fmt::throw_exception("Unimplemented");
+	}
+}
+
 asmjit::x86::Mem asmjit::simd_builder::ptr_scale_for_vec(u32 esize, const x86::Gp& base, const x86::Gp& index)
 {
 	switch (ensure(esize))
@@ -698,6 +746,39 @@ void asmjit::simd_builder::vec_umax(u32 esize, const Operand& dst, const Operand
 	fmt::throw_exception("Unimplemented");
 }
 
+void asmjit::simd_builder::vec_cmp_eq(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs)
+{
+	using enum x86::Inst::Id;
+	if (esize == 2)
+	{
+		if (vsize == 64)
+		{
+			this->evex().emit(kIdVpcmpeqw, x86::k0, lhs, rhs);
+			this->evex().emit(kIdVpmovm2w, dst, x86::k0);
+		}
+		else
+		{
+			_vec_binary_op(kIdPcmpeqw, kIdVpcmpeqw, kIdNone, dst, lhs, rhs);
+		}
+	}
+	else if (esize == 4)
+	{
+		if (vsize == 64)
+		{
+			this->evex().emit(kIdVpcmpeqd, x86::k0, lhs, rhs);
+			this->evex().emit(kIdVpmovm2d, dst, x86::k0);
+		}
+		else
+		{
+			_vec_binary_op(kIdPcmpeqd, kIdVpcmpeqd, kIdNone, dst, lhs, rhs);
+		}
+	}
+	else
+	{
+		fmt::throw_exception("Unimplemented");
+	}
+}
+
 void asmjit::simd_builder::vec_extract_high(u32, const Operand& dst, const Operand& src)
 {
 	if (vsize == 32)
diff --git a/Utilities/JIT.h b/Utilities/JIT.h
index c5bb0b2ed4..f8b44dfffd 100644
--- a/Utilities/JIT.h
+++ b/Utilities/JIT.h
@@ -232,6 +232,7 @@ namespace asmjit
 		void vec_set_all_ones(const Operand& v);
 		void vec_set_const(const Operand& v, const v128& value);
 		void vec_clobbering_test(u32 esize, const Operand& v, const Operand& rhs);
+		void vec_broadcast_gpr(u32 esize, const Operand& v, const x86::Gp& r);
 
 		// return x86::ptr(base, ctr, X, 0) where X is set for esize accordingly
 		x86::Mem ptr_scale_for_vec(u32 esize, const x86::Gp& base, const x86::Gp& index);
@@ -260,8 +261,15 @@ namespace asmjit
 			_vec_binary_op(kIdPor, kIdVpor, kIdVpord, dst, lhs, rhs);
 		}
 
+		void vec_andn(u32, const Operand& dst, const Operand& lhs, const Operand& rhs)
+		{
+			using enum x86::Inst::Id;
+			_vec_binary_op(kIdPandn, kIdVpandn, kIdVpandnd, dst, lhs, rhs);
+		}
+
 		void vec_umin(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
 		void vec_umax(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
+		void vec_cmp_eq(u32 esize, const Operand& dst, const Operand& lhs, const Operand& rhs);
 		void vec_extract_high(u32 esize, const Operand& dst, const Operand& src);
 		void vec_extract_gpr(u32 esize, const x86::Gp& dst, const Operand& src);
 
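Aside (not part of the patch): for readers who do not read asmjit instruction IDs fluently, the sketch below approximates with SSE2 intrinsics what vec_broadcast_gpr(2, ...) and vec_cmp_eq(2, ...) emit on the pre-AVX path — movd + punpcklwd + pshufd to splat a 16-bit GPR value, and pcmpeqw for the per-lane equality mask. The helper names are invented for the illustration.

```cpp
// Illustrative equivalent of the emitted SSE2 sequences; not RPCS3 code.
#include <emmintrin.h>
#include <cstdint>

// movd xmm, r32 ; punpcklwd xmm, xmm ; pshufd xmm, xmm, 0
static __m128i broadcast_u16(uint32_t r)
{
	__m128i v = _mm_cvtsi32_si128(static_cast<int>(r));
	v = _mm_unpacklo_epi16(v, v);
	return _mm_shuffle_epi32(v, 0);
}

// pcmpeqw: 0xFFFF in every 16-bit lane that matches, 0 elsewhere
static __m128i compare_eq_u16(__m128i a, __m128i b)
{
	return _mm_cmpeq_epi16(a, b);
}

int main()
{
	const __m128i restart = broadcast_u16(0xFFFF);
	const __m128i data = _mm_setr_epi16(1, 2, -1, 4, 5, 6, 7, 8); // lane 2 holds 0xFFFF
	alignas(16) uint16_t mask[8];
	_mm_store_si128(reinterpret_cast<__m128i*>(mask), compare_eq_u16(restart, data));
	return (mask[2] == 0xFFFF && mask[0] == 0) ? 0 : 1;
}
```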
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index 199976951f..e2b239486a 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -354,279 +354,161 @@ namespace
 
 	struct primitive_restart_impl
 	{
-#if defined(ARCH_X64)
-		AVX3_FUNC
-		static
-		std::tuple<u16, u16> upload_u16_swapped_avx3(const void *src, void *dst, u32 count, u16 restart_index)
-		{
-			const __m512i s_bswap_u16_mask512 = _mm512_broadcast_i64x2(s_bswap_u16_mask);
-
-			const __m512i s_remainder_mask = _mm512_set_epi16(
-				0x20, 0x1F, 0x1E, 0x1D,
-				0x1C, 0x1B, 0x1A, 0x19,
-				0x18, 0x17, 0x16, 0x15,
-				0x14, 0x13, 0x12, 0x11,
-				0x10, 0xF, 0xE, 0xD,
-				0xC, 0xB, 0xA, 0x9,
-				0x8, 0x7, 0x6, 0x5,
-				0x4, 0x3, 0x2, 0x1);
-
-			auto src_stream = static_cast<const __m512i*>(src);
-			auto dst_stream = static_cast<__m512*>(dst);
-
-			__m512i restart = _mm512_set1_epi16(restart_index);
-			__m512i min = _mm512_set1_epi16(-1);
-			__m512i max = _mm512_set1_epi16(0);
-			const __m512i ones = _mm512_set1_epi16(-1);
-
-			const auto iterations = count / 32;
-			for (unsigned n = 0; n < iterations; ++n)
-			{
-				const __m512i raw = _mm512_loadu_si512(src_stream++);
-				const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
-				const __mmask32 mask = _mm512_cmpneq_epi16_mask(restart, value);
-				const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask, ones, value);
-				max = _mm512_mask_max_epu16(max, mask, max, value);
-				min = _mm512_mask_min_epu16(min, mask, min, value);
-				_mm512_store_si512(dst_stream++, value_with_max_restart);
-			}
-
-			if ((iterations * 32) < count )
-			{
-				const u16 remainder = (count - (iterations * 32));
-				const __m512i remBroadcast = _mm512_set1_epi16(remainder);
-				const __mmask32 mask = _mm512_cmpge_epi16_mask(remBroadcast, s_remainder_mask);
-				const __m512i raw = _mm512_maskz_loadu_epi16(mask, src_stream++);
-				const __m512i value = _mm512_shuffle_epi8(raw, s_bswap_u16_mask512);
-				const __mmask32 mask2 = _mm512_cmpneq_epi16_mask(restart, value);
-				const __mmask32 mask3 = _kand_mask32(mask, mask2);
-				const __m512i value_with_max_restart = _mm512_mask_blend_epi16(mask3, ones, value);
-				max = _mm512_mask_max_epu16(max, mask3, max, value);
-				min = _mm512_mask_min_epu16(min, mask3, min, value);
-				_mm512_mask_storeu_epi16(dst_stream++, mask, value_with_max_restart);
-			}
-
-			__m256i tmp256 = _mm512_extracti64x4_epi64(min, 1);
-			__m256i min2 = _mm512_castsi512_si256(min);
-			min2 = _mm256_min_epu16(min2, tmp256);
-			__m128i tmp = _mm256_extracti128_si256(min2, 1);
-			__m128i min3 = _mm256_castsi256_si128(min2);
-			min3 = _mm_min_epu16(min3, tmp);
-
-			tmp256 = _mm512_extracti64x4_epi64(max, 1);
-			__m256i max2 = _mm512_castsi512_si256(max);
-			max2 = _mm256_max_epu16(max2, tmp256);
-			tmp = _mm256_extracti128_si256(max2, 1);
-			__m128i max3 = _mm256_castsi256_si128(max2);
-			max3 = _mm_max_epu16(max3, tmp);
-
-			const u16 min_index = sse41_hmin_epu16(min3);
-			const u16 max_index = sse41_hmax_epu16(max3);
-
-			return std::make_tuple(min_index, max_index);
-		}
-
-		AVX2_FUNC
-		static
-		std::tuple<u16, u16> upload_u16_swapped_avx2(const void *src, void *dst, u32 iterations, u16 restart_index)
-		{
-			const __m256i shuffle_mask = _mm256_set_m128i(s_bswap_u16_mask, s_bswap_u16_mask);
-
-			auto src_stream = static_cast<const __m256i*>(src);
-			auto dst_stream = static_cast<__m256i*>(dst);
-
-			__m256i restart = _mm256_set1_epi16(restart_index);
-			__m256i min = _mm256_set1_epi16(-1);
-			__m256i max = _mm256_set1_epi16(0);
-
-			for (unsigned n = 0; n < iterations; ++n)
-			{
-				const __m256i raw = _mm256_loadu_si256(src_stream++);
-				const __m256i value = _mm256_shuffle_epi8(raw, shuffle_mask);
-				const __m256i mask = _mm256_cmpeq_epi16(restart, value);
-				const __m256i value_with_min_restart = _mm256_andnot_si256(mask, value);
-				const __m256i value_with_max_restart = _mm256_or_si256(mask, value);
-				max = _mm256_max_epu16(max, value_with_min_restart);
-				min = _mm256_min_epu16(min, value_with_max_restart);
-				_mm256_store_si256(dst_stream++, value_with_max_restart);
-			}
-
-			__m128i tmp = _mm256_extracti128_si256(min, 1);
-			__m128i min2 = _mm256_castsi256_si128(min);
-			min2 = _mm_min_epu16(min2, tmp);
-
-			tmp = _mm256_extracti128_si256(max, 1);
-			__m128i max2 = _mm256_castsi256_si128(max);
-			max2 = _mm_max_epu16(max2, tmp);
-
-			const u16 min_index = sse41_hmin_epu16(min2);
-			const u16 max_index = sse41_hmax_epu16(max2);
-
-			return std::make_tuple(min_index, max_index);
-		}
-#endif
-
-		SSE4_1_FUNC
-		static
-		std::tuple<u16, u16> upload_u16_swapped_sse4_1(const void *src, void *dst, u32 iterations, u16 restart_index)
-		{
-			auto src_stream = static_cast<const __m128i*>(src);
-			auto dst_stream = static_cast<__m128i*>(dst);
-
-			__m128i restart = _mm_set1_epi16(restart_index);
-			__m128i min = _mm_set1_epi16(-1);
-			__m128i max = _mm_set1_epi16(0);
-
-			for (unsigned n = 0; n < iterations; ++n)
-			{
-				const __m128i raw = _mm_loadu_si128(src_stream++);
-				const __m128i value = _mm_shuffle_epi8(raw, s_bswap_u16_mask);
-				const __m128i mask = _mm_cmpeq_epi16(restart, value);
-				const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
-				const __m128i value_with_max_restart = _mm_or_si128(mask, value);
-				max = _mm_max_epu16(max, value_with_min_restart);
-				min = _mm_min_epu16(min, value_with_max_restart);
-				_mm_store_si128(dst_stream++, value_with_max_restart);
-			}
-
-			const u16 min_index = sse41_hmin_epu16(min);
-			const u16 max_index = sse41_hmax_epu16(max);
-
-			return std::make_tuple(min_index, max_index);
-		}
-
-		SSE4_1_FUNC
-		static
-		std::tuple<u32, u32> upload_u32_swapped_sse4_1(const void *src, void *dst, u32 iterations, u32 restart_index)
-		{
-			auto src_stream = static_cast<const __m128i*>(src);
-			auto dst_stream = static_cast<__m128i*>(dst);
-
-			__m128i restart = _mm_set1_epi32(restart_index);
-			__m128i min = _mm_set1_epi32(0xffffffff);
-			__m128i max = _mm_set1_epi32(0);
-
-			for (unsigned n = 0; n < iterations; ++n)
-			{
-				const __m128i raw = _mm_loadu_si128(src_stream++);
-				const __m128i value = _mm_shuffle_epi8(raw, s_bswap_u32_mask);
-				const __m128i mask = _mm_cmpeq_epi32(restart, value);
-				const __m128i value_with_min_restart = _mm_andnot_si128(mask, value);
-				const __m128i value_with_max_restart = _mm_or_si128(mask, value);
-				max = _mm_max_epu32(max, value_with_min_restart);
-				min = _mm_min_epu32(min, value_with_max_restart);
-				_mm_store_si128(dst_stream++, value_with_max_restart);
-			}
-
-			__m128i tmp = _mm_srli_si128(min, 8);
-			min = _mm_min_epu32(min, tmp);
-			tmp = _mm_srli_si128(min, 4);
-			min = _mm_min_epu32(min, tmp);
-
-			tmp = _mm_srli_si128(max, 8);
-			max = _mm_max_epu32(max, tmp);
-			tmp = _mm_srli_si128(max, 4);
-			max = _mm_max_epu32(max, tmp);
-
-			const u32 min_index = _mm_cvtsi128_si32(min);
-			const u32 max_index = _mm_cvtsi128_si32(max);
-
-			return std::make_tuple(min_index, max_index);
-		}
-
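Aside (not part of the patch): the u16 paths above reduce through sse41_hmin_epu16/sse41_hmax_epu16, which are defined elsewhere in BufferUtils.cpp and are not shown in this diff. The usual SSE4.1 way to build such helpers is phminposuw; the sketch below is an assumption about their shape, not the file's actual definition.

```cpp
// Assumed phminposuw-style horizontal u16 min/max helpers; illustrative only.
#include <smmintrin.h>
#include <cstdint>

static uint16_t hmin_epu16(__m128i v)
{
	// _mm_minpos_epu16 leaves the minimum of the 8 lanes in bits 0..15.
	return static_cast<uint16_t>(_mm_cvtsi128_si32(_mm_minpos_epu16(v)));
}

static uint16_t hmax_epu16(__m128i v)
{
	// max(x) == ~min(~x) for unsigned lanes.
	const __m128i inv = _mm_xor_si128(v, _mm_set1_epi32(-1));
	return static_cast<uint16_t>(~_mm_cvtsi128_si32(_mm_minpos_epu16(inv)));
}
```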
-		template <typename T>
-		static
-		std::tuple<T, T, u32> upload_untouched(std::span<const be_t<T>> src, std::span<T> dst, T restart_index, bool skip_restart)
+		template <typename T>
+		static inline u64 upload_untouched_naive(const be_t<T>* src, T* dst, u32 count, T restart_index)
 		{
 			T min_index = index_limit<T>();
 			T max_index = 0;
-			u32 written = 0;
-			u32 length = ::size32(src);
-			if (length >= 32 && !skip_restart)
+			for (u32 i = 0; i < count; ++i)
 			{
-				if constexpr (std::is_same<T, u16>::value)
+				T index = src[i].value();
+				dst[i] = index == restart_index ? index_limit<T>() : min_max(min_index, max_index, index);
+			}
+
+			return (u64{max_index} << 32) | u64{min_index};
+		}
+
+#ifdef ARCH_X64
+		template <typename T>
+		static void build_upload_untouched(asmjit::simd_builder& c, native_args& args)
+		{
+			using namespace asmjit;
+
+			if (!utils::has_sse41())
+			{
+				c.jmp(&upload_untouched_naive<T>);
+				return;
+			}
+
+			c.vec_set_const(c.v1, sizeof(T) == 2 ? s_bswap_u16_mask : s_bswap_u32_mask);
+			c.vec_set_all_ones(c.v2); // vec min
+			c.vec_set_all_zeros(c.v3); // vec max
+			c.vec_broadcast_gpr(sizeof(T), c.v4, args[3].r32());
+
+			c.build_loop(sizeof(T), x86::eax, args[2].r32(), [&]
+			{
+				c.zero_if_not_masked().vec_load_unaligned(sizeof(T), c.v0, c.ptr_scale_for_vec(sizeof(T), args[0], x86::rax));
+
+				if (utils::has_ssse3())
 				{
-					if (s_use_avx3)
+					c.vec_shuffle_xi8(c.v0, c.v0, c.v1);
+				}
+				else
+				{
+					c.emit(x86::Inst::kIdMovdqa, c.v1, c.v0);
+					c.emit(x86::Inst::kIdPsrlw, c.v0, 8);
+					c.emit(x86::Inst::kIdPsllw, c.v1, 8);
+					c.emit(x86::Inst::kIdPor, c.v0, c.v1);
+
+					if constexpr (sizeof(T) == 4)
 					{
-#if defined(ARCH_X64)
-						// Handle remainder in function
-						written = length;
-						std::tie(min_index, max_index) = upload_u16_swapped_avx3(src.data(), dst.data(), length, restart_index);
-						return std::make_tuple(min_index, max_index, written);
+						c.emit(x86::Inst::kIdPshuflw, c.v0, c.v0, 0b10110001);
+						c.emit(x86::Inst::kIdPshufhw, c.v0, c.v0, 0b10110001);
 					}
-					else if (s_use_avx2)
-					{
-						u32 iterations = length >> 4;
-						written = length & ~0xF;
-						std::tie(min_index, max_index) = upload_u16_swapped_avx2(src.data(), dst.data(), iterations, restart_index);
+				}
+
+				c.vec_cmp_eq(sizeof(T), c.v5, c.v4, c.v0);
+				c.vec_andn(sizeof(T), c.v5, c.v5, c.v0);
+				c.keep_if_not_masked().vec_umax(sizeof(T), c.v3, c.v3, c.v5);
+				c.vec_cmp_eq(sizeof(T), c.v5, c.v4, c.v0);
+				c.vec_or(sizeof(T), c.v0, c.v0, c.v5);
+				c.keep_if_not_masked().vec_umin(sizeof(T), c.v2, c.v2, c.v0);
+				c.keep_if_not_masked().vec_store_unaligned(sizeof(T), c.v0, c.ptr_scale_for_vec(sizeof(T), args[1], x86::rax));
+			}, [&]
+			{
+				// Compress horizontally, protect high values
+				c.vec_extract_high(sizeof(T), c.v0, c.v3);
+				c.vec_umax(sizeof(T), c.v3, c.v3, c.v0);
+				c.vec_extract_high(sizeof(T), c.v0, c.v2);
+				c.vec_umin(sizeof(T), c.v2, c.v2, c.v0);
+			});
+
+			c.vec_extract_gpr(sizeof(T), x86::edx, c.v3);
+			c.vec_extract_gpr(sizeof(T), x86::eax, c.v2);
+			c.shl(x86::rdx, 32);
+			c.or_(x86::rax, x86::rdx);
+			c.vec_cleanup_ret();
+		}
+
+		static inline auto upload_xi16 = build_function_asm<u64(*)(const be_t<u16>*, u16*, u32, u32), asmjit::simd_builder>("restart_untouched_upload_xi16", &build_upload_untouched<u16>);
+		static inline auto upload_xi32 = build_function_asm<u64(*)(const be_t<u32>*, u32*, u32, u32), asmjit::simd_builder>("restart_untouched_upload_xi32", &build_upload_untouched<u32>);
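Aside (not part of the patch): the generated loop keeps the running min/max immune to the restart index the same way the removed intrinsics code did — the equality mask is ANDN'd out of the value before the unsigned max and OR'd into it before the unsigned min — and the caller receives the result packed as (max << 32) | min. A self-contained illustration of those two details, with invented helper names:

```cpp
// Illustrative only; mirrors the mask trick and the u64 packing, not RPCS3 code.
#include <emmintrin.h>
#include <cstdint>

static uint64_t pack_min_max(uint32_t min_index, uint32_t max_index)
{
	// Same layout as "(u64{max_index} << 32) | u64{min_index}" above.
	return (uint64_t{max_index} << 32) | uint64_t{min_index};
}

int main()
{
	const __m128i restart = _mm_set1_epi16(-1);                        // restart index 0xFFFF
	const __m128i value   = _mm_setr_epi16(7, 3, -1, 9, 12, 5, -1, 8); // lanes 2 and 6 are restart entries
	const __m128i mask    = _mm_cmpeq_epi16(restart, value);

	// Restart lanes become 0 for the running max and 0xFFFF for the running min,
	// so they can never win either reduction.
	const __m128i for_max = _mm_andnot_si128(mask, value);
	const __m128i for_min = _mm_or_si128(mask, value);

	alignas(16) uint16_t mx[8], mn[8];
	_mm_store_si128(reinterpret_cast<__m128i*>(mx), for_max);
	_mm_store_si128(reinterpret_cast<__m128i*>(mn), for_min);

	return (mx[2] == 0 && mn[2] == 0xFFFF && pack_min_max(3, 12) == 0xC00000003ull) ? 0 : 1;
}
```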
 #endif
-					}
-					else if (s_use_sse4_1)
-					{
-						u32 iterations = length >> 3;
-						written = length & ~0x7;
-						std::tie(min_index, max_index) = upload_u16_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
-					}
-				}
-				else if constexpr (std::is_same<T, u32>::value)
-				{
-					if (s_use_sse4_1)
-					{
-						u32 iterations = length >> 2;
-						written = length & ~0x3;
-						std::tie(min_index, max_index) = upload_u32_swapped_sse4_1(src.data(), dst.data(), iterations, restart_index);
-					}
-				}
-				else
-				{
-					fmt::throw_exception("Unreachable");
-				}
-			}
-			for (u32 i = written; i < length; ++i)
-			{
-				T index = src[i];
-				if (index == restart_index)
-				{
-					if (!skip_restart)
-					{
-						dst[written++] = index_limit<T>();
-					}
-				}
-				else
-				{
-					dst[written++] = min_max(min_index, max_index, index);
-				}
-			}
+		template <typename T>
+		static inline std::tuple<T, T, u32> upload_untouched(std::span<const be_t<T>> src, std::span<T> dst, T restart_index)
+		{
+			T min_index, max_index;
+			u32 count = ::size32(src);
+			u64 r;
-			return std::make_tuple(min_index, max_index, written);
+#if defined(ARCH_X64)
+			if constexpr (sizeof(T) == 2)
+				r = upload_xi16(src.data(), dst.data(), count, restart_index);
+			else
+				r = upload_xi32(src.data(), dst.data(), count, restart_index);
+#else
+			r = upload_untouched_naive(src.data(), dst.data(), count, restart_index);
+#endif
+
+			min_index = r;
+			max_index = r >> 32;
+
+			return std::make_tuple(min_index, max_index, count);
 		}
 	};
 
-	template <typename T>
-	std::tuple<T, T, u32> upload_untouched(std::span<const be_t<T>> src, std::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
+template <typename T>
+NEVER_INLINE std::tuple<T, T, u32> upload_untouched_skip_restart(std::span<const be_t<T>> src, std::span<T> dst, T restart_index)
+{
+	T min_index = index_limit<T>();
+	T max_index = 0;
+	u32 written = 0;
+	u32 length = ::size32(src);
+
+	for (u32 i = written; i < length; ++i)
 	{
-		if (!is_primitive_restart_enabled)
+		T index = src[i];
+		if (index != restart_index)
+		{
+			dst[written++] = min_max(min_index, max_index, index);
+		}
+	}
+
+	return std::make_tuple(min_index, max_index, written);
+}
+
+template <typename T>
+std::tuple<T, T, u32> upload_untouched(std::span<const be_t<T>> src, std::span<T> dst, rsx::primitive_type draw_mode, bool is_primitive_restart_enabled, u32 primitive_restart_index)
+{
+	if (!is_primitive_restart_enabled)
+	{
+		return untouched_impl::upload_untouched(src, dst);
+	}
+	else if constexpr (std::is_same<T, u16>::value)
+	{
+		if (primitive_restart_index > 0xffff)
 		{
 			return untouched_impl::upload_untouched(src, dst);
 		}
-		else if constexpr (std::is_same<T, u16>::value)
+		else if (is_primitive_disjointed(draw_mode))
 		{
-			if (primitive_restart_index > 0xffff)
-			{
-				return untouched_impl::upload_untouched(src, dst);
-			}
-			else
-			{
-				return primitive_restart_impl::upload_untouched(src, dst, static_cast<u16>(primitive_restart_index), is_primitive_disjointed(draw_mode));
-			}
+			return upload_untouched_skip_restart(src, dst, static_cast<u16>(primitive_restart_index));
 		}
 		else
 		{
-			return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index, is_primitive_disjointed(draw_mode));
+			return primitive_restart_impl::upload_untouched(src, dst, static_cast<u16>(primitive_restart_index));
 		}
 	}
+	else if (is_primitive_disjointed(draw_mode))
+	{
+		return upload_untouched_skip_restart(src, dst, primitive_restart_index);
+	}
+	else
+	{
+		return primitive_restart_impl::upload_untouched(src, dst, primitive_restart_index);
+	}
+}
 
 	template <typename T>
 	std::tuple<T, T, u32> expand_indexed_triangle_fan(std::span<const be_t<T>> src, std::span<T> dst, bool is_primitive_restart_enabled, u32 primitive_restart_index)