diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index 8e2f2e89f4..1b88b311b5 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -10,8 +10,6 @@
 #include "emmintrin.h"
 #include "immintrin.h"
 
-#define DEBUG_VERTEX_STREAMING 0
-
 #if !defined(_MSC_VER) && defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wold-style-cast"
@@ -39,11 +37,6 @@ using __m256i = long long __attribute__((vector_size(32)));
 #endif
 #endif // _MSC_VER
 
-SSSE3_FUNC static inline __m128i ssse3_shuffle_epi8(__m128i x, __m128i y)
-{
-	return _mm_shuffle_epi8(x, y);
-}
-
 SSE4_1_FUNC static inline u16 sse41_hmin_epu16(__m128i x)
 {
 	return _mm_cvtsi128_si32(_mm_minpos_epu16(x));
@@ -104,23 +97,6 @@ namespace utils
 namespace
 {
-	/**
-	 * Convert CMP vector to RGBA16.
-	 * A vector in CMP (compressed) format is stored as X11Y11Z10 and has a W component of 1.
-	 * X11 and Y11 channels are int between -1024 and 1023 interpreted as -1.f, 1.f
-	 * Z10 is int between -512 and 511 interpreted as -1.f, 1.f
-	 */
-	std::array<u16, 4> decode_cmp_vector(u32 encoded_vector)
-	{
-		u16 Z = encoded_vector >> 22;
-		Z = Z << 6;
-		u16 Y = (encoded_vector >> 11) & 0x7FF;
-		Y = Y << 5;
-		u16 X = encoded_vector & 0x7FF;
-		X = X << 5;
-		return{ X, Y, Z, 1 };
-	}
-
 	template <bool Compare>
 	PLAIN_FUNC bool copy_data_swap_u32_naive(u32* dst, const u32* src, u32 count)
 	{
@@ -184,475 +160,6 @@ built_function<void(*)(u32*, const u32*, u32)> copy_data_swap_u32(&build_copy_
 
 built_function<bool(*)(u32*, const u32*, u32)> copy_data_swap_u32_cmp(&build_copy_data_swap_u32<true>);
 
-namespace
-{
-	inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
-	{
-		auto dst_ptr = static_cast<__m128i*>(dst);
-		auto src_ptr = static_cast<const __m128i*>(src);
-
-		const u32 dword_count = (vertex_count * (stride >> 2));
-		const u32 iterations = dword_count >> 2;
-		const u32 remaining = dword_count % 4;
-
-		if (s_use_ssse3) [[likely]]
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vector = _mm_loadu_si128(src_ptr);
-				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, s_bswap_u32_mask);
-				_mm_stream_si128(dst_ptr, shuffled_vector);
-
-				src_ptr++;
-				dst_ptr++;
-			}
-		}
-		else
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vec0 = _mm_loadu_si128(src_ptr);
-				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
-				const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
-				_mm_stream_si128(dst_ptr, vec2);
-
-				src_ptr++;
-				dst_ptr++;
-			}
-		}
-
-		if (remaining)
-		{
-			const auto src_ptr2 = utils::bless<const se_t<u32, true, 1>>(src_ptr);
-			const auto dst_ptr2 = utils::bless<nse_t<u32, 1>>(dst_ptr);
-
-			for (u32 i = 0; i < remaining; ++i)
-				dst_ptr2[i] = src_ptr2[i];
-		}
-	}
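Both branches of the streaming loop above implement the same u32 byteswap: the SSSE3 path reverses the bytes of all four dwords with a single pshufb, while the SSE2 fallback needs two dependent shift/or pairs (swap bytes within words, then words within dwords). A minimal standalone sketch of the two strategies; the mask below is the canonical per-dword byte-reversal mask and is assumed to match the s_bswap_u32_mask defined elsewhere in this file:

```cpp
// Compile with SSSE3 enabled (e.g. -mssse3). Both helpers byteswap each of
// the four u32 lanes in a 128-bit register.
#include <emmintrin.h>
#include <tmmintrin.h>
#include <cstdint>
#include <cstdio>

// Assumed equivalent of s_bswap_u32_mask: byte i of each dword selects byte (3 - i).
static const __m128i bswap_u32_mask = _mm_set_epi8(
	12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

static inline __m128i bswap32_ssse3(__m128i v)
{
	return _mm_shuffle_epi8(v, bswap_u32_mask); // one shuffle does all the work
}

static inline __m128i bswap32_sse2(__m128i v)
{
	// Swap bytes within each u16, then swap the u16 halves of each u32.
	const __m128i w = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
	return _mm_or_si128(_mm_slli_epi32(w, 16), _mm_srli_epi32(w, 16));
}

int main()
{
	alignas(16) uint32_t in[4] = { 0x11223344, 0xAABBCCDD, 0x01020304, 0xDEADBEEF };
	alignas(16) uint32_t a[4], b[4];
	const __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(in));
	_mm_store_si128(reinterpret_cast<__m128i*>(a), bswap32_ssse3(v));
	_mm_store_si128(reinterpret_cast<__m128i*>(b), bswap32_sse2(v));
	std::printf("%08x %08x\n", a[0], b[0]); // both print 44332211
}
```

The `_mm_stream_si128` stores in the real code are additionally non-temporal, bypassing the cache, which pays off for large write-once vertex uploads.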
-
-	inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
-	{
-		auto dst_ptr = static_cast<__m128i*>(dst);
-		auto src_ptr = static_cast<const __m128i*>(src);
-
-		const u32 word_count = (vertex_count * (stride >> 1));
-		const u32 iterations = word_count >> 3;
-		const u32 remaining = word_count % 8;
-
-		if (s_use_ssse3) [[likely]]
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vector = _mm_loadu_si128(src_ptr);
-				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, s_bswap_u16_mask);
-				_mm_stream_si128(dst_ptr, shuffled_vector);
-
-				src_ptr++;
-				dst_ptr++;
-			}
-		}
-		else
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vec0 = _mm_loadu_si128(src_ptr);
-				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
-				_mm_stream_si128(dst_ptr, vec1);
-
-				src_ptr++;
-				dst_ptr++;
-			}
-		}
-
-		if (remaining)
-		{
-			auto src_ptr2 = utils::bless<const se_t<u16, true, 1>>(src_ptr);
-			auto dst_ptr2 = utils::bless<nse_t<u16, 1>>(dst_ptr);
-
-			for (u32 i = 0; i < remaining; ++i)
-				dst_ptr2[i] = src_ptr2[i];
-		}
-	}
-
-	inline void stream_data_to_memory_swapped_u32_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
-	{
-		auto src_ptr = static_cast<const char*>(src);
-		auto dst_ptr = static_cast<char*>(dst);
-
-		//Count vertices to copy
-		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
-
-		u32 min_block_size = std::min(src_stride, dst_stride);
-		if (min_block_size == 0) min_block_size = dst_stride;
-
-		u32 iterations = 0;
-		u32 remainder = is_128_aligned ? 0 : 1 + ((16 - min_block_size) / min_block_size);
-
-		if (vertex_count > remainder)
-			iterations = vertex_count - remainder;
-		else
-			remainder = vertex_count;
-
-		if (s_use_ssse3) [[likely]]
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
-				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, s_bswap_u32_mask);
-				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), shuffled_vector);
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-		else
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vec0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
-				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
-				const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
-				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), vec2);
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-
-		if (remainder)
-		{
-			const u8 attribute_sz = min_block_size >> 2;
-			for (u32 n = 0; n < remainder; ++n)
-			{
-				auto src_ptr2 = utils::bless<const be_t<u32>>(src_ptr);
-				auto dst_ptr2 = utils::bless<u32>(dst_ptr);
-
-				for (u32 v = 0; v < attribute_sz; ++v)
-					dst_ptr2[v] = src_ptr2[v];
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-	}
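The remainder computation above is the subtle part of the non-continuous path: each vectorized iteration stores a full 16 bytes at dst even when the attribute block is narrower, so the last few vertices must take the scalar tail or the final wide stores would run past the end of the destination. The expression 1 + ((16 - min_block_size) / min_block_size) works out to floor(16 / min_block_size) trailing vertices. A standalone sketch of just that guard:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Number of trailing vertices diverted to the scalar tail so that a 16-byte
// store at any vectorized vertex stays inside the destination buffer.
static uint32_t tail_vertices(uint8_t dst_stride, uint8_t src_stride)
{
	const bool is_128_aligned = !((dst_stride | src_stride) & 15);
	uint32_t min_block_size = std::min<uint32_t>(src_stride, dst_stride);
	if (min_block_size == 0) min_block_size = dst_stride;
	return is_128_aligned ? 0 : 1 + ((16 - min_block_size) / min_block_size);
}

int main()
{
	std::printf("%u\n", tail_vertices(16, 16)); // 0: every store ends on a vertex boundary
	std::printf("%u\n", tail_vertices(4, 4));   // 4: the last four wide stores would overrun
	std::printf("%u\n", tail_vertices(12, 12)); // 1: only the final store overruns
}
```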
-
-	inline void stream_data_to_memory_swapped_u16_non_continuous(void *dst, const void *src, u32 vertex_count, u8 dst_stride, u8 src_stride)
-	{
-		auto src_ptr = static_cast<const char*>(src);
-		auto dst_ptr = static_cast<char*>(dst);
-
-		const bool is_128_aligned = !((dst_stride | src_stride) & 15);
-
-		u32 min_block_size = std::min(src_stride, dst_stride);
-		if (min_block_size == 0) min_block_size = dst_stride;
-
-		u32 iterations = 0;
-		u32 remainder = is_128_aligned ? 0 : 1 + ((16 - min_block_size) / min_block_size);
-
-		if (vertex_count > remainder)
-			iterations = vertex_count - remainder;
-		else
-			remainder = vertex_count;
-
-		if (s_use_ssse3) [[likely]]
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
-				const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, s_bswap_u16_mask);
-				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), shuffled_vector);
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-		else
-		{
-			for (u32 i = 0; i < iterations; ++i)
-			{
-				const __m128i vec0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
-				const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
-				_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr), vec1);
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-
-		if (remainder)
-		{
-			const u8 attribute_sz = min_block_size >> 1;
-			for (u32 n = 0; n < remainder; ++n)
-			{
-				auto src_ptr2 = utils::bless<const be_t<u16>>(src_ptr);
-				auto dst_ptr2 = utils::bless<u16>(dst_ptr);
-
-				for (u32 v = 0; v < attribute_sz; ++v)
-					dst_ptr2[v] = src_ptr2[v];
-
-				src_ptr += src_stride;
-				dst_ptr += dst_stride;
-			}
-		}
-	}
-
-	inline void stream_data_to_memory_u8_non_continuous(void *dst, const void *src, u32 vertex_count, u8 attribute_size, u8 dst_stride, u8 src_stride)
-	{
-		auto src_ptr = static_cast<const char*>(src);
-		auto dst_ptr = static_cast<char*>(dst);
-
-		switch (attribute_size)
-		{
-		case 4:
-		{
-			//Read one dword every iteration
-			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-			{
-				*reinterpret_cast<u32*>(dst_ptr) = *reinterpret_cast<const u32*>(src_ptr);
-
-				dst_ptr += dst_stride;
-				src_ptr += src_stride;
-			}
-
-			break;
-		}
-		case 3:
-		{
-			//Read one word and one byte
-			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-			{
-				*reinterpret_cast<u16*>(dst_ptr) = *reinterpret_cast<const u16*>(src_ptr);
-				dst_ptr[2] = src_ptr[2];
-
-				dst_ptr += dst_stride;
-				src_ptr += src_stride;
-			}
-
-			break;
-		}
-		case 2:
-		{
-			//Copy u16 blocks
-			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-			{
-				*reinterpret_cast<u16*>(dst_ptr) = *reinterpret_cast<const u16*>(src_ptr);
-
-				dst_ptr += dst_stride;
-				src_ptr += src_stride;
-			}
-
-			break;
-		}
-		case 1:
-		{
-			for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-			{
-				dst_ptr[0] = src_ptr[0];
-
-				dst_ptr += dst_stride;
-				src_ptr += src_stride;
-			}
-
-			break;
-		}
-		}
-	}
-
-	template <typename T, typename U, int N>
-	void copy_whole_attribute_array_impl(void* raw_dst, const void* raw_src, u8 dst_stride, u32 src_stride, u32 vertex_count)
-	{
-		auto src_ptr = static_cast<const char*>(raw_src);
-		auto dst_ptr = static_cast<char*>(raw_dst);
-
-		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-		{
-			auto typed_dst = reinterpret_cast<T*>(dst_ptr);
-			auto typed_src = reinterpret_cast<const U*>(src_ptr);
-
-			for (u32 i = 0; i < N; ++i)
-			{
-				typed_dst[i] = typed_src[i];
-			}
-
-			src_ptr += src_stride;
-			dst_ptr += dst_stride;
-		}
-	}
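Note that copy_whole_attribute_array_impl never byteswaps explicitly: the conversion hides in the element assignment when T is an endian-aware wrapper (rpcs3's be_t), which is how the <be_t<u16>, u16> instantiations further below perform their swap. A standalone illustration, with be_u16 as a hypothetical stand-in for the real be_t:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

struct be_u16 // hypothetical stand-in for rpcs3's be_t<u16>
{
	uint16_t raw; // stored big-endian
	be_u16& operator=(uint16_t v)
	{
		raw = static_cast<uint16_t>((v >> 8) | (v << 8)); // assume little-endian host
		return *this;
	}
};

int main()
{
	const unsigned char guest_bytes[2] = { 0x12, 0x34 }; // big-endian 0x1234
	uint16_t v;
	std::memcpy(&v, guest_bytes, 2);  // naive read on a little-endian host: 0x3412
	be_u16 dst{};
	dst = v;                          // the "typed_dst[i] = typed_src[i]" step
	std::printf("%04x\n", static_cast<unsigned>(dst.raw)); // prints 1234: host order restored
}
```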
-
-	/*
-	 * Copies a number of src vertices, repeated over and over to fill the dst
-	 * e.g repeat 2 vertices over a range of 16 verts, so 8 reps
-	 */
-	template <typename T, typename U, int N>
-	void copy_whole_attribute_array_repeating_impl(void* raw_dst, const void* raw_src, const u8 dst_stride, const u32 src_stride, const u32 vertex_count, const u32 src_vertex_count)
-	{
-		auto src_ptr = static_cast<const char*>(raw_src);
-		auto dst_ptr = static_cast<char*>(raw_dst);
-
-		u32 src_offset = 0;
-		u32 src_limit = src_stride * src_vertex_count;
-
-		for (u32 vertex = 0; vertex < vertex_count; ++vertex)
-		{
-			auto typed_dst = reinterpret_cast<T*>(dst_ptr);
-			auto typed_src = reinterpret_cast<const U*>(src_ptr + src_offset);
-
-			for (u32 i = 0; i < N; ++i)
-			{
-				typed_dst[i] = typed_src[i];
-			}
-
-			src_offset = (src_offset + src_stride) % src_limit;
-			dst_ptr += dst_stride;
-		}
-	}
-
-	template <typename T, typename U>
-	void copy_whole_attribute_array(void* raw_dst, const void* raw_src, const u8 attribute_size, const u8 dst_stride, const u32 src_stride, const u32 vertex_count, const u32 src_vertex_count)
-	{
-		//Eliminate the inner loop by templating the inner loop counter N
-
-		if (src_vertex_count == vertex_count)
-		{
-			switch (attribute_size)
-			{
-			case 1:
-				copy_whole_attribute_array_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
-				break;
-			case 2:
-				copy_whole_attribute_array_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
-				break;
-			case 3:
-				copy_whole_attribute_array_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
-				break;
-			case 4:
-				copy_whole_attribute_array_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count);
-				break;
-			}
-		}
-		else
-		{
-			switch (attribute_size)
-			{
-			case 1:
-				copy_whole_attribute_array_repeating_impl<T, U, 1>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
-				break;
-			case 2:
-				copy_whole_attribute_array_repeating_impl<T, U, 2>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
-				break;
-			case 3:
-				copy_whole_attribute_array_repeating_impl<T, U, 3>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
-				break;
-			case 4:
-				copy_whole_attribute_array_repeating_impl<T, U, 4>(raw_dst, raw_src, dst_stride, src_stride, vertex_count, src_vertex_count);
-				break;
-			}
-		}
-	}
-}
-
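The switch in copy_whole_attribute_array exists purely to convert the runtime attribute size (1 to 4 components) into the compile-time constant N, giving the per-vertex inner loop a known trip count that the compiler can fully unroll. The shape of the trick, reduced to a sketch:

```cpp
#include <cstdint>
#include <cstdio>

template <typename T, int N>
static void copy_fixed(T* dst, const T* src)
{
	for (int i = 0; i < N; ++i) // trip count is a compile-time constant
		dst[i] = src[i];
}

template <typename T>
static void copy_dispatch(T* dst, const T* src, int attribute_size)
{
	switch (attribute_size) // runtime size -> compile-time N, decided once per call
	{
	case 1: copy_fixed<T, 1>(dst, src); break;
	case 2: copy_fixed<T, 2>(dst, src); break;
	case 3: copy_fixed<T, 3>(dst, src); break;
	case 4: copy_fixed<T, 4>(dst, src); break;
	}
}

int main()
{
	const uint32_t src[4] = { 1, 2, 3, 4 };
	uint32_t dst[4] = {};
	copy_dispatch(dst, src, 3);
	std::printf("%u %u %u %u\n", dst[0], dst[1], dst[2], dst[3]); // 1 2 3 0
}
```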
-void write_vertex_array_data_to_buffer(std::span<std::byte> raw_dst_span, std::span<const std::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness)
-{
-	ensure((vector_element_count > 0));
-	const u32 src_read_stride = rsx::get_vertex_type_size_on_host(type, vector_element_count);
-
-	bool use_stream_no_stride = false;
-	bool use_stream_with_stride = false;
-
-	//If stride is not defined, we have a packed array
-	if (attribute_src_stride == 0) attribute_src_stride = src_read_stride;
-
-	//Sometimes, we get a vertex attribute to be repeated. Just copy the supplied vertices only
-	//TODO: Stop these requests from getting here in the first place!
-	//TODO: Check if it is possible to have a repeating array with more than one attribute instance
-	const u32 real_count = static_cast<u32>(src_ptr.size_bytes()) / attribute_src_stride;
-	if (real_count == 1) attribute_src_stride = 0;	//Always fetch src[0]
-
-	//TODO: Determine favourable vertex threshold where vector setup costs become negligible
-	//Tests show that even with 4 vertices, using traditional bswap is significantly slower over a large number of calls
-
-	const u64 src_address = reinterpret_cast<u64>(src_ptr.data());
-	const bool sse_aligned = ((src_address & 15) == 0);
-
-#if !DEBUG_VERTEX_STREAMING
-
-	if (swap_endianness)
-	{
-		if (real_count >= count || real_count == 1)
-		{
-			if (attribute_src_stride == dst_stride && src_read_stride == dst_stride)
-				use_stream_no_stride = true;
-			else
-				use_stream_with_stride = true;
-		}
-	}
-
-#endif
-
-	switch (type)
-	{
-	case rsx::vertex_base_type::ub:
-	case rsx::vertex_base_type::ub256:
-	{
-		if (use_stream_no_stride)
-			memcpy(raw_dst_span.data(), src_ptr.data(), count * dst_stride);
-		else if (use_stream_with_stride)
-			stream_data_to_memory_u8_non_continuous(raw_dst_span.data(), src_ptr.data(), count, vector_element_count, dst_stride, attribute_src_stride);
-		else
-			copy_whole_attribute_array<u8, u8>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
-
-		return;
-	}
-	case rsx::vertex_base_type::s1:
-	case rsx::vertex_base_type::sf:
-	case rsx::vertex_base_type::s32k:
-	{
-		if (use_stream_no_stride && sse_aligned)
-			stream_data_to_memory_swapped_u16(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
-		else if (use_stream_with_stride)
-			stream_data_to_memory_swapped_u16_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
-		else if (swap_endianness)
-			copy_whole_attribute_array<be_t<u16>, u16>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
-		else
-			copy_whole_attribute_array<u16, u16>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
-
-		return;
-	}
-	case rsx::vertex_base_type::f:
-	{
-		if (use_stream_no_stride && sse_aligned)
-			stream_data_to_memory_swapped_u32(raw_dst_span.data(), src_ptr.data(), count, attribute_src_stride);
-		else if (use_stream_with_stride)
-			stream_data_to_memory_swapped_u32_non_continuous(raw_dst_span.data(), src_ptr.data(), count, dst_stride, attribute_src_stride);
-		else if (swap_endianness)
-			copy_whole_attribute_array<be_t<u32>, u32>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
-		else
-			copy_whole_attribute_array<u32, u32>(raw_dst_span.data(), src_ptr.data(), vector_element_count, dst_stride, attribute_src_stride, count, real_count);
-
-		return;
-	}
-	case rsx::vertex_base_type::cmp:
-	{
-		std::span<u16> dst_span = utils::bless<u16>(raw_dst_span);
-		for (u32 i = 0; i < count; ++i)
-		{
-			u32 src_value;
-			memcpy(&src_value, src_ptr.subspan(attribute_src_stride * i).data(), sizeof(u32));
-
-			if (swap_endianness) src_value = stx::se_storage<u32>::swap(src_value);
-
-			const auto& decoded_vector = decode_cmp_vector(src_value);
-			dst_span[i * dst_stride / sizeof(u16)] = decoded_vector[0];
-			dst_span[i * dst_stride / sizeof(u16) + 1] = decoded_vector[1];
-			dst_span[i * dst_stride / sizeof(u16) + 2] = decoded_vector[2];
-			dst_span[i * dst_stride / sizeof(u16) + 3] = decoded_vector[3];
-		}
-		return;
-	}
-	}
-}
-
 namespace
 {
 	template <typename T>
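For reference, the removed cmp case above expanded each X11Y11Z10 dword into four u16 lanes by shifting every field up to the top of the 16-bit range, so the field's sign bit lands in bit 15 and the value can be consumed as a signed normalized component; W is the constant 1. A worked standalone version of that decode:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Same field extraction as the removed decode_cmp_vector: X and Y are
// 11-bit signed fields (-1024..1023 mapped to -1..1), Z is 10 bits
// (-512..511), and each is left-shifted so its sign bit becomes bit 15.
static std::array<uint16_t, 4> decode_cmp_vector(uint32_t enc)
{
	const uint16_t z = static_cast<uint16_t>((enc >> 22) << 6);
	const uint16_t y = static_cast<uint16_t>(((enc >> 11) & 0x7FF) << 5);
	const uint16_t x = static_cast<uint16_t>((enc & 0x7FF) << 5);
	return { x, y, z, 1 };
}

int main()
{
	// X = 1023 (max positive), Y = -1024 (encoded 0x400), Z = 0
	const uint32_t enc = (0u << 22) | (0x400u << 11) | 0x3FFu;
	const auto v = decode_cmp_vector(enc);
	std::printf("%04x %04x %04x %04x\n",
		static_cast<unsigned>(v[0]), static_cast<unsigned>(v[1]),
		static_cast<unsigned>(v[2]), static_cast<unsigned>(v[3]));
	// prints: 7fe0 8000 0000 0001
}
```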
diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h
index 84684c33e4..1c872e7319 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.h
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.h
@@ -5,12 +5,6 @@
 
 #include <span>
 
-/**
- * Write count vertex attributes from src_ptr.
- * src_ptr array layout is deduced from the type, vector element count and src_stride arguments.
- */
-void write_vertex_array_data_to_buffer(std::span<std::byte> raw_dst_span, std::span<const std::byte> src_ptr, u32 count, rsx::vertex_base_type type, u32 vector_element_count, u32 attribute_src_stride, u8 dst_stride, bool swap_endianness);
-
 /*
  * If primitive mode is not supported and need to be emulated (using an index buffer) returns false.
  */
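With this declaration gone, endian-swapped vertex uploads are served by the JIT-built copy_data_swap_u32 / copy_data_swap_u32_cmp objects that remain in BufferUtils.cpp. Going by the copy_data_swap_u32_naive reference visible in the context lines of the .cpp hunk, the contract is a byteswapping u32 copy, with the _cmp variant also reporting whether anything in the destination actually changed; a scalar sketch of that assumed contract (not the JIT implementation itself):

```cpp
#include <cstdint>
#include <cstdio>

// Assumed contract: copy count u32 values from src to dst, byteswapping
// each; when Compare is true, report whether any destination word changed.
template <bool Compare>
static bool copy_data_swap_u32_ref(uint32_t* dst, const uint32_t* src, uint32_t count)
{
	uint32_t diff = 0;
	for (uint32_t i = 0; i < count; i++)
	{
		const uint32_t v = src[i];
		const uint32_t swapped = (v >> 24) | ((v >> 8) & 0xFF00u) | ((v << 8) & 0xFF0000u) | (v << 24);
		if constexpr (Compare)
			diff |= swapped ^ dst[i];
		dst[i] = swapped;
	}
	return diff != 0;
}

int main()
{
	const uint32_t src[2] = { 0x11223344, 0xAABBCCDD };
	uint32_t dst[2] = {};
	const bool changed = copy_data_swap_u32_ref<true>(dst, src, 2);
	std::printf("%08x %08x %d\n", dst[0], dst[1], changed); // 44332211 ddccbbaa 1
}
```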