diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index f48131593f..91bd04afdd 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -467,10 +467,8 @@ namespace gl return nullptr; case 2: return get_compute_task(); - break; case 4: return get_compute_task(); - break; default: fmt::throw_exception("Unsupported format"); } @@ -951,33 +949,19 @@ namespace gl auto pack_info = get_format_type(src); auto unpack_info = get_format_type(dst); - if (!caps.ARB_compute_shader_supported) + // D32FS8 can be read back as D24S8 or D32S8X24. In case of the latter, double memory requirements + if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) { - auto remove_depth_transformation = [](const texture* tex, pixel_buffer_layout& pack_info) - { - if (tex->aspect() & image_aspect::depth) - { - switch (pack_info.type) - { - case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: - pack_info.type = GL_UNSIGNED_INT_24_8; - break; - case GL_FLOAT: - pack_info.type = GL_HALF_FLOAT; - break; - } - } - }; - - remove_depth_transformation(src, pack_info); - remove_depth_transformation(dst, unpack_info); + src_mem.image_size_in_bytes *= 2; + } + + if (unpack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) + { + dst_mem.image_size_in_bytes *= 2; } - // Start pack operation - void* transfer_offset = nullptr; if (caps.ARB_compute_shader_supported) [[likely]] { - // Apply transformation bool skip_transform = false; if ((src->aspect() | dst->aspect()) == gl::image_aspect::color) { @@ -989,21 +973,20 @@ namespace gl if (skip_transform) [[likely]] { - const bool old_swap_bytes = pack_info.swap_bytes; + // Disable byteswap to make the transport operation passthrough pack_info.swap_bytes = false; - - copy_image_to_buffer(pack_info, src, &g_typeless_transfer_buffer, 0, src_region, &src_mem); - pack_info.swap_bytes = old_swap_bytes; - } - else - { - void* data_ptr = copy_image_to_buffer(pack_info, src, &g_typeless_transfer_buffer, 0, src_region, &src_mem); - copy_buffer_to_image(unpack_info, &g_typeless_transfer_buffer, dst, data_ptr, 0, dst_region, &dst_mem); + unpack_info.swap_bytes = false; } + void* data_ptr = copy_image_to_buffer(pack_info, src, &g_typeless_transfer_buffer, 0, src_region, &src_mem); + copy_buffer_to_image(unpack_info, &g_typeless_transfer_buffer, dst, data_ptr, 0, dst_region, &dst_mem); + + // Cleanup // NOTE: glBindBufferRange also binds the buffer to the old-school target. // Unbind it to avoid glitching later glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); + glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); } else { @@ -1014,26 +997,72 @@ namespace gl g_typeless_transfer_buffer.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY); } + // Simplify pack/unpack information to something OpenGL can natively digest + auto remove_depth_transformation = [](const texture* tex, pixel_buffer_layout& pack_info) + { + if (tex->aspect() & image_aspect::depth) + { + switch (pack_info.type) + { + case GL_UNSIGNED_INT_24_8: + pack_info.swap_bytes = false; + break; + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + pack_info.type = GL_UNSIGNED_INT_24_8; + pack_info.swap_bytes = false; + break; + case GL_FLOAT: + pack_info.type = GL_HALF_FLOAT; + break; + } + } + }; + + remove_depth_transformation(src, pack_info); + remove_depth_transformation(dst, unpack_info); + + // Attempt to compensate for the lack of compute shader modifiers + // If crossing the aspect boundary between color and depth + // and one image is depth, invert byteswap for the other one to compensate + const auto cross_aspect_test = (image_aspect::color | image_aspect::depth); + const auto test = (src->aspect() | dst->aspect()) & cross_aspect_test; + if (test == cross_aspect_test) + { + if (src->aspect() & image_aspect::depth) + { + // Source is depth, modify unpack rule + if (pack_info.size == 4 && unpack_info.size == 4) + { + unpack_info.swap_bytes = !unpack_info.swap_bytes; + } + } + else + { + // Dest is depth, modify pack rule + if (pack_info.size == 4 && unpack_info.size == 4) + { + pack_info.swap_bytes = !pack_info.swap_bytes; + } + } + } + + // Start pack operation pixel_pack_settings pack_settings{}; pack_settings.swap_bytes(pack_info.swap_bytes); g_typeless_transfer_buffer.bind(buffer::target::pixel_pack); src->copy_to(nullptr, static_cast(pack_info.format), static_cast(pack_info.type), 0, src_region, pack_settings); - } - glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); + glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - // Start unpack operation - pixel_unpack_settings unpack_settings{}; - - if (!caps.ARB_compute_shader_supported) [[unlikely]] - { + // Start unpack operation + pixel_unpack_settings unpack_settings{}; unpack_settings.swap_bytes(unpack_info.swap_bytes); - } - g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack); - dst->copy_from(transfer_offset, static_cast(unpack_info.format), static_cast(unpack_info.type), 0, dst_region, unpack_settings); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); + g_typeless_transfer_buffer.bind(buffer::target::pixel_unpack); + dst->copy_from(nullptr, static_cast(unpack_info.format), static_cast(unpack_info.type), 0, dst_region, unpack_settings); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE); + } } void copy_typeless(texture* dst, const texture* src) diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index d0c9cdcd80..5e204c3f95 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -17,6 +17,7 @@ #include "GLTexture.h" #include "../Common/TextureUtils.h" #include "../Common/texture_cache.h" +#include "../Common/BufferUtils.h" class GLGSRender; @@ -187,6 +188,12 @@ namespace gl mem_info.image_size_in_bytes = src->pitch() * src->height(); mem_info.memory_required = 0; + if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) + { + // D32FS8 can be read back as D24S8 or D32S8X24. In case of the latter, double memory requirements + mem_info.image_size_in_bytes *= 2; + } + void* out_offset = copy_image_to_buffer(pack_info, src, &scratch_mem, 0, { {}, src->size3D() }, &mem_info); glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); @@ -351,11 +358,13 @@ namespace gl } case gl::texture::type::uint_24_8: { + // Swap bytes on D24S8 does not swap the whole dword, just shuffles the 3 bytes for D24 + // In this regard, D24S8 is the same structure on both PC and PS3, but the endianness of the whole block is reversed on PS3 verify(HERE), pack_unpack_swap_bytes == false; verify(HERE), real_pitch == (width * 4); if (rsx_pitch == real_pitch) [[likely]] { - rsx::convert_le_d24x8_to_be_d24x8(dst, dst, valid_length / 4, 1); + stream_data_to_memory_swapped_u32(dst, dst, valid_length / 4, 4); } else { @@ -363,7 +372,7 @@ namespace gl u8* data = static_cast(dst); for (u32 row = 0; row < num_rows; ++row) { - rsx::convert_le_d24x8_to_be_d24x8(data, data, width, 1); + stream_data_to_memory_swapped_u32(data, data, width, 4); data += rsx_pitch; } } diff --git a/rpcs3/Emu/RSX/rsx_utils.cpp b/rpcs3/Emu/RSX/rsx_utils.cpp index b44a965058..cc637fbf1d 100644 --- a/rpcs3/Emu/RSX/rsx_utils.cpp +++ b/rpcs3/Emu/RSX/rsx_utils.cpp @@ -101,355 +101,6 @@ namespace rsx } } - /* Fast image scaling routines - * Only uses fast nearest scaling and integral scaling factors - * T - Dst type - * U - Src type - * N - Sample count - */ - template - void scale_image_fallback_impl(T* dst, const U* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 element_size, u8 samples_u, u8 samples_v) - { - u32 dst_offset = 0; - u32 src_offset = 0; - - u32 padding = (dst_pitch - (src_pitch * samples_u)) / sizeof(T); - - for (u16 h = 0; h < src_height; ++h) - { - const auto row_start = dst_offset; - for (u16 w = 0; w < src_width; ++w) - { - for (u8 n = 0; n < samples_u; ++n) - { - dst[dst_offset++] = src[src_offset]; - } - - src_offset++; - } - - dst_offset += padding; - - for (int n = 1; n < samples_v; ++n) - { - memcpy(&dst[dst_offset], &dst[row_start], dst_pitch); - dst_offset += dst_pitch; - } - } - } - - void scale_image_fallback(void* dst, const void* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 element_size, u8 samples_u, u8 samples_v) - { - switch (element_size) - { - case 1: - scale_image_fallback_impl(static_cast(dst), static_cast(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - case 2: - scale_image_fallback_impl(static_cast(dst), static_cast(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - case 4: - scale_image_fallback_impl(static_cast(dst), static_cast(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - default: - fmt::throw_exception("unsupported element size %d" HERE, element_size); - } - } - - void scale_image_fallback_with_byte_swap(void* dst, const void* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 element_size, u8 samples_u, u8 samples_v) - { - switch (element_size) - { - case 1: - scale_image_fallback_impl(static_cast(dst), static_cast(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - case 2: - scale_image_fallback_impl>(static_cast(dst), static_cast*>(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - case 4: - scale_image_fallback_impl>(static_cast(dst), static_cast*>(src), src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - break; - default: - fmt::throw_exception("unsupported element size %d" HERE, element_size); - } - } - - template - void scale_image_impl(T* dst, const U* src, u16 src_width, u16 src_height, u16 padding) - { - u32 dst_offset = 0; - u32 src_offset = 0; - - for (u16 h = 0; h < src_height; ++h) - { - for (u16 w = 0; w < src_width; ++w) - { - for (u8 n = 0; n < N; ++n) - { - dst[dst_offset++] = src[src_offset]; - } - - //Fetch next pixel - src_offset++; - } - - //Pad this row - dst_offset += padding; - } - } - - template - void scale_image_fast(void *dst, const void *src, u8 element_size, u16 src_width, u16 src_height, u16 padding) - { - switch (element_size) - { - case 1: - scale_image_impl(static_cast(dst), static_cast(src), src_width, src_height, padding); - break; - case 2: - scale_image_impl(static_cast(dst), static_cast(src), src_width, src_height, padding); - break; - case 4: - scale_image_impl(static_cast(dst), static_cast(src), src_width, src_height, padding); - break; - case 8: - scale_image_impl(static_cast(dst), static_cast(src), src_width, src_height, padding); - break; - default: - fmt::throw_exception("unsupported pixel size %d" HERE, element_size); - } - } - - template - void scale_image_fast_with_byte_swap(void *dst, const void *src, u8 element_size, u16 src_width, u16 src_height, u16 padding) - { - switch (element_size) - { - case 1: - scale_image_impl(static_cast(dst), static_cast(src), src_width, src_height, padding); - break; - case 2: - scale_image_impl, N>(static_cast(dst), static_cast*>(src), src_width, src_height, padding); - break; - case 4: - scale_image_impl, N>(static_cast(dst), static_cast*>(src), src_width, src_height, padding); - break; - case 8: - scale_image_impl, N>(static_cast(dst), static_cast*>(src), src_width, src_height, padding); - break; - default: - fmt::throw_exception("unsupported pixel size %d" HERE, element_size); - } - } - - void scale_image_nearest(void* dst, const void* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 element_size, u8 samples_u, u8 samples_v, bool swap_bytes) - { - //Scale this image by repeating pixel data n times - //n = expected_pitch / real_pitch - //Use of fixed argument templates for performance reasons - - const u16 dst_width = dst_pitch / element_size; - const u16 padding = dst_width - (src_width * samples_u); - - if (!swap_bytes) - { - if (samples_v == 1) - { - switch (samples_u) - { - case 1: - scale_image_fast<1>(dst, src, element_size, src_width, src_height, padding); - break; - case 2: - scale_image_fast<2>(dst, src, element_size, src_width, src_height, padding); - break; - case 3: - scale_image_fast<3>(dst, src, element_size, src_width, src_height, padding); - break; - case 4: - scale_image_fast<4>(dst, src, element_size, src_width, src_height, padding); - break; - case 8: - scale_image_fast<8>(dst, src, element_size, src_width, src_height, padding); - break; - case 16: - scale_image_fast<16>(dst, src, element_size, src_width, src_height, padding); - break; - default: - scale_image_fallback(dst, src, src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, 1); - } - } - else - { - scale_image_fallback(dst, src, src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - } - } - else - { - if (samples_v == 1) - { - switch (samples_u) - { - case 1: - scale_image_fast_with_byte_swap<1>(dst, src, element_size, src_width, src_height, padding); - break; - case 2: - scale_image_fast_with_byte_swap<2>(dst, src, element_size, src_width, src_height, padding); - break; - case 3: - scale_image_fast_with_byte_swap<3>(dst, src, element_size, src_width, src_height, padding); - break; - case 4: - scale_image_fast_with_byte_swap<4>(dst, src, element_size, src_width, src_height, padding); - break; - case 8: - scale_image_fast_with_byte_swap<8>(dst, src, element_size, src_width, src_height, padding); - break; - case 16: - scale_image_fast_with_byte_swap<16>(dst, src, element_size, src_width, src_height, padding); - break; - default: - scale_image_fallback_with_byte_swap(dst, src, src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, 1); - } - } - else - { - scale_image_fallback_with_byte_swap(dst, src, src_width, src_height, dst_pitch, src_pitch, element_size, samples_u, samples_v); - } - } - } - - void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) - { - const u32 num_pixels = row_length_in_texels * num_rows; - verify(HERE), (num_pixels & 3) == 0; - - const auto num_iterations = (num_pixels >> 2); - - __m128i* dst_ptr = static_cast<__m128i*>(dst); - __m128i* src_ptr = static_cast<__m128i*>(src); - - const __m128 scale_vector = _mm_set1_ps(16777214.f); - -#if defined (_MSC_VER) || defined (__SSSE3__) - if (utils::has_ssse3()) [[likely]] - { - const __m128i swap_mask = _mm_set_epi8 - ( - 0xF, 0xC, 0xD, 0xE, - 0xB, 0x8, 0x9, 0xA, - 0x7, 0x4, 0x5, 0x6, - 0x3, 0x0, 0x1, 0x2 - ); - - for (u32 n = 0; n < num_iterations; ++n) - { - const __m128i src_vector = _mm_loadu_si128(src_ptr); - const __m128i result = _mm_cvtps_epi32(_mm_mul_ps(_mm_castsi128_ps(src_vector), scale_vector)); - const __m128i shuffled_vector = _mm_shuffle_epi8(result, swap_mask); - _mm_stream_si128(dst_ptr, shuffled_vector); - ++dst_ptr; - ++src_ptr; - } - - return; - } -#endif - - const __m128i mask1 = _mm_set1_epi32(0xFF00FF00); - const __m128i mask2 = _mm_set1_epi32(0x00FF0000); - const __m128i mask3 = _mm_set1_epi32(0x000000FF); - - for (u32 n = 0; n < num_iterations; ++n) - { - const __m128i src_vector = _mm_loadu_si128(src_ptr); - const __m128i result = _mm_cvtps_epi32(_mm_mul_ps(_mm_castsi128_ps(src_vector), scale_vector)); - - const __m128i v1 = _mm_and_si128(result, mask1); - const __m128i v2 = _mm_and_si128(_mm_slli_epi32(result, 16), mask2); - const __m128i v3 = _mm_and_si128(_mm_srli_epi32(result, 16), mask3); - const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3); - - _mm_stream_si128(dst_ptr, shuffled_vector); - ++dst_ptr; - ++src_ptr; - } - } - - void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) - { - const u32 num_pixels = row_length_in_texels * num_rows; - verify(HERE), (num_pixels & 3) == 0; - - const auto num_iterations = (num_pixels >> 2); - - __m128i* dst_ptr = static_cast<__m128i*>(dst); - __m128i* src_ptr = static_cast<__m128i*>(src); - -#if defined (_MSC_VER) || defined (__SSSE3__) - if (utils::has_ssse3()) [[likely]] - { - const __m128i swap_mask = _mm_set_epi8 - ( - 0xF, 0xC, 0xD, 0xE, - 0xB, 0x8, 0x9, 0xA, - 0x7, 0x4, 0x5, 0x6, - 0x3, 0x0, 0x1, 0x2 - ); - - for (u32 n = 0; n < num_iterations; ++n) - { - const __m128i src_vector = _mm_loadu_si128(src_ptr); - const __m128i shuffled_vector = _mm_shuffle_epi8(src_vector, swap_mask); - _mm_stream_si128(dst_ptr, shuffled_vector); - ++dst_ptr; - ++src_ptr; - } - - return; - } -#endif - - const __m128i mask1 = _mm_set1_epi32(0xFF00FF00); - const __m128i mask2 = _mm_set1_epi32(0x00FF0000); - const __m128i mask3 = _mm_set1_epi32(0x000000FF); - - for (u32 n = 0; n < num_iterations; ++n) - { - const __m128i src_vector = _mm_loadu_si128(src_ptr); - const __m128i v1 = _mm_and_si128(src_vector, mask1); - const __m128i v2 = _mm_and_si128(_mm_slli_epi32(src_vector, 16), mask2); - const __m128i v3 = _mm_and_si128(_mm_srli_epi32(src_vector, 16), mask3); - const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3); - - _mm_stream_si128(dst_ptr, shuffled_vector); - ++dst_ptr; - ++src_ptr; - } - } - - void convert_le_d24x8_to_le_f32(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) - { - const u32 num_pixels = row_length_in_texels * num_rows; - verify(HERE), (num_pixels & 3) == 0; - - const auto num_iterations = (num_pixels >> 2); - - __m128i* dst_ptr = static_cast<__m128i*>(dst); - __m128i* src_ptr = static_cast<__m128i*>(src); - - const __m128 scale_vector = _mm_set1_ps(1.f / 16777214.f); - const __m128i mask = _mm_set1_epi32(0x00FFFFFF); - for (u32 n = 0; n < num_iterations; ++n) - { - const __m128 src_vector = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_loadu_si128(src_ptr))); - const __m128 normalized_vector = _mm_mul_ps(src_vector, scale_vector); - _mm_stream_si128(dst_ptr, _mm_castps_si128(normalized_vector)); - ++dst_ptr; - ++src_ptr; - } - } - #ifdef TEXTURE_CACHE_DEBUG tex_cache_checker_t tex_cache_checker = {}; #endif diff --git a/rpcs3/Emu/RSX/rsx_utils.h b/rpcs3/Emu/RSX/rsx_utils.h index 6076ed5bed..199c0315ef 100644 --- a/rpcs3/Emu/RSX/rsx_utils.h +++ b/rpcs3/Emu/RSX/rsx_utils.h @@ -446,18 +446,12 @@ namespace rsx } } - void scale_image_nearest(void* dst, const void* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 element_size, u8 samples_u, u8 samples_v, bool swap_bytes = false); - void convert_scale_image(u8 *dst, AVPixelFormat dst_format, int dst_width, int dst_height, int dst_pitch, const u8 *src, AVPixelFormat src_format, int src_width, int src_height, int src_pitch, int src_slice_h, bool bilinear); void clip_image(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch); void clip_image_may_overlap(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch, u8* buffer); - void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows); - void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows); - void convert_le_d24x8_to_le_f32(void *dst, void *src, u32 row_length_in_texels, u32 num_rows); - std::array get_constant_blend_colors(); /**