From 6d6d0e4e369f520ed36892802c3d13f98364a067 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Wed, 29 Mar 2017 22:27:29 +0300 Subject: [PATCH] gl: Use the GPU to scale textures; use ARB_sampler_object Improve scaling and separate sampler state from texture state gl: Unify all texture cache objects under one structure separate by use case gl: Texture cache fixes - Acquire lock when finding matching textures - Account for swizzled surfaces when deciding whether to cpu memcpy - Handle swizzled images on the GPU --- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 22 +- rpcs3/Emu/RSX/GL/GLGSRender.h | 8 +- rpcs3/Emu/RSX/GL/GLProcTable.h | 7 + rpcs3/Emu/RSX/GL/GLRenderTargets.h | 64 ++- rpcs3/Emu/RSX/GL/GLTexture.cpp | 216 ++++---- rpcs3/Emu/RSX/GL/GLTexture.h | 63 +-- rpcs3/Emu/RSX/GL/GLTextureCache.cpp | 20 +- rpcs3/Emu/RSX/GL/GLTextureCache.h | 743 ++++++++++++++++++++++------ rpcs3/Emu/RSX/rsx_cache.h | 44 +- rpcs3/Emu/RSX/rsx_methods.cpp | 52 +- rpcs3/Emu/RSX/rsx_utils.cpp | 11 +- 11 files changed, 880 insertions(+), 370 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 858e8f6f95..800999d306 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -408,6 +408,7 @@ void GLGSRender::end() std::chrono::time_point textures_start = steady_clock::now(); //Setup textures + //Setting unused texture to 0 is not needed, but makes program validation happy if we choose to enforce it for (int i = 0; i < rsx::limits::fragment_textures_count; ++i) { int location; @@ -422,6 +423,7 @@ void GLGSRender::end() { m_gl_textures[i].set_target(get_gl_target_for_texture(rsx::method_registers.fragment_textures[i])); __glcheck m_gl_texture_cache.upload_texture(i, rsx::method_registers.fragment_textures[i], m_gl_textures[i], m_rtts); + __glcheck m_gl_sampler_states[i].apply(rsx::method_registers.fragment_textures[i]); } } @@ -572,6 +574,12 @@ void GLGSRender::on_init_thread() if (g_cfg_rsx_overlay) m_text_printer.init(); + for (int i = 0; i < rsx::limits::fragment_textures_count; ++i) + { + m_gl_sampler_states[i].create(); + m_gl_sampler_states[i].bind(i); + } + m_gl_texture_cache.initialize(this); } @@ -606,6 +614,11 @@ void GLGSRender::on_exit() tex.remove(); } + for (auto &sampler : m_gl_sampler_states) + { + sampler.remove(); + } + m_attrib_ring_buffer->remove(); m_transform_constants_buffer->remove(); m_fragment_constants_buffer->remove(); @@ -879,7 +892,7 @@ void GLGSRender::flip(int buffer) gl::screen.clear(gl::buffers::color_depth_stencil); - __glcheck flip_fbo->blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical()); + __glcheck flip_fbo->blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear); if (g_cfg_rsx_overlay) { @@ -960,7 +973,7 @@ void GLGSRender::do_local_task() } } -work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::cached_rtt_section *section) +work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::cached_texture_section *section) { std::lock_guard lock(queue_guard); @@ -979,3 +992,8 @@ void GLGSRender::synchronize_buffers() flush_draw_buffers = false; } } + +bool GLGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) +{ + return m_gl_texture_cache.upload_scaled_image(src, dst, interpolate, m_rtts); +} diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index 5cdc01610e..89348215e6 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -9,6 +9,7 @@ #include "define_new_memleakdetect.h" #include "GLProgramBuffer.h" #include "GLTextOut.h" +#include "../rsx_cache.h" #pragma comment(lib, "opengl32.lib") @@ -18,7 +19,7 @@ struct work_item std::mutex guard_mutex; u32 address_to_flush = 0; - gl::texture_cache::cached_rtt_section *section_to_flush = nullptr; + gl::texture_cache::cached_texture_section *section_to_flush = nullptr; volatile bool processed = false; volatile bool result = false; @@ -57,6 +58,7 @@ private: rsx::gl::texture m_gl_textures[rsx::limits::fragment_textures_count]; rsx::gl::texture m_gl_vertex_textures[rsx::limits::vertex_textures_count]; + gl::sampler_state m_gl_sampler_states[rsx::limits::fragment_textures_count]; gl::glsl::program *m_program; @@ -129,7 +131,9 @@ public: void set_viewport(); void synchronize_buffers(); - work_item& post_flush_request(u32 address, gl::texture_cache::cached_rtt_section *section); + work_item& post_flush_request(u32 address, gl::texture_cache::cached_texture_section *section); + + bool scaled_image_from_memory(rsx::blit_src_info& src_info, rsx::blit_dst_info& dst_info, bool interpolate) override; protected: void begin() override; diff --git a/rpcs3/Emu/RSX/GL/GLProcTable.h b/rpcs3/Emu/RSX/GL/GLProcTable.h index 3c55e3fca9..5d03d216cf 100644 --- a/rpcs3/Emu/RSX/GL/GLProcTable.h +++ b/rpcs3/Emu/RSX/GL/GLProcTable.h @@ -172,6 +172,13 @@ OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays); OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT); +//Sampler Objects +OPENGL_PROC(PFNGLGENSAMPLERSPROC, GenSamplers); +OPENGL_PROC(PFNGLDELETESAMPLERSPROC, DeleteSamplers); +OPENGL_PROC(PFNGLBINDSAMPLERPROC, BindSampler); +OPENGL_PROC(PFNGLSAMPLERPARAMETERIPROC, SamplerParameteri); +OPENGL_PROC(PFNGLSAMPLERPARAMETERFVPROC, SamplerParameterfv); + //Texture Buffers OPENGL_PROC(PFNGLTEXBUFFERPROC, TexBuffer); OPENGL_PROC(PFNGLTEXTUREBUFFERRANGEEXTPROC, TextureBufferRangeEXT); diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h index 04282ae6a6..761bbfa0f3 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h @@ -116,7 +116,9 @@ namespace gl } // For an address within the texture, extract this sub-section's rect origin - std::tuple get_texture_subresource(u32 offset) + // Checks whether we need to scale the subresource if it is not handled in shader + // NOTE1: When surface->real_pitch < rsx_pitch, the surface is assumed to have been scaled to fill the rsx_region + std::tuple get_texture_subresource(u32 offset, bool scale_to_fit) { if (!offset) { @@ -132,9 +134,14 @@ namespace gl if (!surface_pixel_size) surface_pixel_size = native_pitch / surface_width; - u32 pixel_offset = (offset / surface_pixel_size); - u32 y = (pixel_offset / surface_width); - u32 x = (pixel_offset % surface_width); + const u32 y = (offset / rsx_pitch); + u32 x = (offset % rsx_pitch) / surface_pixel_size; + + if (scale_to_fit) + { + const f32 x_scale = (f32)rsx_pitch / native_pitch; + x = (u32)((f32)x / x_scale); + } return std::make_tuple(true, (u16)x, (u16)y); } @@ -291,18 +298,19 @@ struct surface_subresource bool is_bound = false; bool is_depth_surface = false; + bool is_clipped = false; surface_subresource() {} - surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth) - : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth) + surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false) + : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), is_clipped(_Clipped) {} }; class gl_render_targets : public rsx::surface_store { private: - bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y) + bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit) { bool is_subslice = false; u16 x_offset = 0; @@ -314,7 +322,7 @@ private: u32 offset = texaddr - surface_address; if (offset >= 0) { - std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset); + std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset, scale_to_fit); if (is_subslice) { *x = x_offset; @@ -354,7 +362,7 @@ private: } public: - surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch) + surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit =false, bool crop=false) { gl::render_target *surface = nullptr; bool is_subslice = false; @@ -366,21 +374,35 @@ public: u32 this_address = std::get<0>(tex_info); surface = std::get<1>(tex_info).get(); - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset)) + if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) { if (surface->get_rsx_pitch() != requested_pitch) continue; auto dims = surface->get_dimensions(); - + + if (scale_to_fit) + { + f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); + requested_width /= pitch_scaling; + } + if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, false), false }; else { + if (crop) //Forcefully fit the requested region by clipping and scaling + { + u16 remaining_width = dims.first - x_offset; + u16 remaining_height = dims.second - y_offset; + + return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, false), false, true }; + } + if (dims.first >= requested_width && dims.second >= requested_height) { LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false }; + return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false, true }; } } } @@ -392,21 +414,35 @@ public: u32 this_address = std::get<0>(tex_info); surface = std::get<1>(tex_info).get(); - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset)) + if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) { if (surface->get_rsx_pitch() != requested_pitch) continue; auto dims = surface->get_dimensions(); + if (scale_to_fit) + { + f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); + requested_width /= pitch_scaling; + } + if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, true), true }; else { + if (crop) //Forcefully fit the requested region by clipping and scaling + { + u16 remaining_width = dims.first - x_offset; + u16 remaining_height = dims.second - y_offset; + + return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, true), true, true }; + } + if (dims.first >= requested_width && dims.second >= requested_height) { LOG_WARNING(RSX, "Overlapping depth surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true }; + return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true, true }; } } } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index ed211eddfd..968e1f0721 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -67,6 +67,116 @@ namespace gl } fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format); } + + GLenum wrap_mode(rsx::texture_wrap_mode wrap) + { + switch (wrap) + { + case rsx::texture_wrap_mode::wrap: return GL_REPEAT; + case rsx::texture_wrap_mode::mirror: return GL_MIRRORED_REPEAT; + case rsx::texture_wrap_mode::clamp_to_edge: return GL_CLAMP_TO_EDGE; + case rsx::texture_wrap_mode::border: return GL_CLAMP_TO_BORDER; + case rsx::texture_wrap_mode::clamp: return GL_CLAMP_TO_EDGE; + case rsx::texture_wrap_mode::mirror_once_clamp_to_edge: return GL_MIRROR_CLAMP_TO_EDGE_EXT; + case rsx::texture_wrap_mode::mirror_once_border: return GL_MIRROR_CLAMP_TO_BORDER_EXT; + case rsx::texture_wrap_mode::mirror_once_clamp: return GL_MIRROR_CLAMP_EXT; + } + + LOG_ERROR(RSX, "Texture wrap error: bad wrap (%d)", (u32)wrap); + return GL_REPEAT; + } + + float max_aniso(rsx::texture_max_anisotropy aniso) + { + switch (aniso) + { + case rsx::texture_max_anisotropy::x1: return 1.0f; + case rsx::texture_max_anisotropy::x2: return 2.0f; + case rsx::texture_max_anisotropy::x4: return 4.0f; + case rsx::texture_max_anisotropy::x6: return 6.0f; + case rsx::texture_max_anisotropy::x8: return 8.0f; + case rsx::texture_max_anisotropy::x10: return 10.0f; + case rsx::texture_max_anisotropy::x12: return 12.0f; + case rsx::texture_max_anisotropy::x16: return 16.0f; + } + + LOG_ERROR(RSX, "Texture anisotropy error: bad max aniso (%d)", (u32)aniso); + return 1.0f; + } + + int tex_min_filter(rsx::texture_minify_filter min_filter) + { + switch (min_filter) + { + case rsx::texture_minify_filter::nearest: return GL_NEAREST; + case rsx::texture_minify_filter::linear: return GL_LINEAR; + case rsx::texture_minify_filter::nearest_nearest: return GL_NEAREST_MIPMAP_NEAREST; + case rsx::texture_minify_filter::linear_nearest: return GL_LINEAR_MIPMAP_NEAREST; + case rsx::texture_minify_filter::nearest_linear: return GL_NEAREST_MIPMAP_LINEAR; + case rsx::texture_minify_filter::linear_linear: return GL_LINEAR_MIPMAP_LINEAR; + case rsx::texture_minify_filter::convolution_min: return GL_LINEAR_MIPMAP_LINEAR; + } + fmt::throw_exception("Unknow min filter" HERE); + } + + int tex_mag_filter(rsx::texture_magnify_filter mag_filter) + { + switch (mag_filter) + { + case rsx::texture_magnify_filter::nearest: return GL_NEAREST; + case rsx::texture_magnify_filter::linear: return GL_LINEAR; + case rsx::texture_magnify_filter::convolution_mag: return GL_LINEAR; + } + fmt::throw_exception("Unknow mag filter" HERE); + } + + //Apply sampler state settings + void sampler_state::apply(rsx::fragment_texture& tex) + { + const f32 border_color = (f32)tex.border_color() / 255; + const f32 border_color_array[] = { border_color, border_color, border_color, border_color }; + + glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_S, wrap_mode(tex.wrap_s())); + glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_T, wrap_mode(tex.wrap_t())); + glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_R, wrap_mode(tex.wrap_r())); + glSamplerParameterfv(samplerHandle, GL_TEXTURE_BORDER_COLOR, border_color_array); + + if (tex.get_exact_mipmap_count() <= 1) + { + GLint min_filter = tex_min_filter(tex.min_filter()); + + if (min_filter != GL_LINEAR && min_filter != GL_NEAREST) + { + switch (min_filter) + { + case GL_NEAREST_MIPMAP_NEAREST: + case GL_NEAREST_MIPMAP_LINEAR: + min_filter = GL_NEAREST; break; + case GL_LINEAR_MIPMAP_NEAREST: + case GL_LINEAR_MIPMAP_LINEAR: + min_filter = GL_LINEAR; break; + default: + LOG_ERROR(RSX, "No mipmap fallback defined for rsx_min_filter = 0x%X", (u32)tex.min_filter()); + min_filter = GL_NEAREST; + } + } + + glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_FILTER, min_filter); + glSamplerParameteri(samplerHandle, GL_TEXTURE_LOD_BIAS, 0.); + glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_LOD, 0); + glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_LOD, 0); + } + else + { + glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_FILTER, tex_min_filter(tex.min_filter())); + glSamplerParameteri(samplerHandle, GL_TEXTURE_LOD_BIAS, tex.bias()); + glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_LOD, (tex.min_lod() >> 8)); + glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_LOD, (tex.max_lod() >> 8)); + } + + glSamplerParameteri(samplerHandle, GL_TEXTURE_MAG_FILTER, tex_mag_filter(tex.mag_filter())); + glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_ANISOTROPY_EXT, ::gl::max_aniso(tex.max_aniso())); + } } namespace @@ -182,32 +292,6 @@ namespace rsx { namespace gl { - int gl_tex_min_filter(rsx::texture_minify_filter min_filter) - { - switch (min_filter) - { - case rsx::texture_minify_filter::nearest: return GL_NEAREST; - case rsx::texture_minify_filter::linear: return GL_LINEAR; - case rsx::texture_minify_filter::nearest_nearest: return GL_NEAREST_MIPMAP_NEAREST; - case rsx::texture_minify_filter::linear_nearest: return GL_LINEAR_MIPMAP_NEAREST; - case rsx::texture_minify_filter::nearest_linear: return GL_NEAREST_MIPMAP_LINEAR; - case rsx::texture_minify_filter::linear_linear: return GL_LINEAR_MIPMAP_LINEAR; - case rsx::texture_minify_filter::convolution_min: return GL_LINEAR_MIPMAP_LINEAR; - } - fmt::throw_exception("Unknow min filter" HERE); - } - - int gl_tex_mag_filter(rsx::texture_magnify_filter mag_filter) - { - switch (mag_filter) - { - case rsx::texture_magnify_filter::nearest: return GL_NEAREST; - case rsx::texture_magnify_filter::linear: return GL_LINEAR; - case rsx::texture_magnify_filter::convolution_mag: return GL_LINEAR; - } - fmt::throw_exception("Unknow mag filter" HERE); - } - static const int gl_tex_zfunc[] = { GL_NEVER, @@ -230,42 +314,6 @@ namespace rsx glGenTextures(1, &m_id); } - int texture::gl_wrap(rsx::texture_wrap_mode wrap) - { - switch (wrap) - { - case rsx::texture_wrap_mode::wrap: return GL_REPEAT; - case rsx::texture_wrap_mode::mirror: return GL_MIRRORED_REPEAT; - case rsx::texture_wrap_mode::clamp_to_edge: return GL_CLAMP_TO_EDGE; - case rsx::texture_wrap_mode::border: return GL_CLAMP_TO_BORDER; - case rsx::texture_wrap_mode::clamp: return GL_CLAMP_TO_EDGE; - case rsx::texture_wrap_mode::mirror_once_clamp_to_edge: return GL_MIRROR_CLAMP_TO_EDGE_EXT; - case rsx::texture_wrap_mode::mirror_once_border: return GL_MIRROR_CLAMP_TO_BORDER_EXT; - case rsx::texture_wrap_mode::mirror_once_clamp: return GL_MIRROR_CLAMP_EXT; - } - - LOG_ERROR(RSX, "Texture wrap error: bad wrap (%d)", (u32)wrap); - return GL_REPEAT; - } - - float texture::max_aniso(rsx::texture_max_anisotropy aniso) - { - switch (aniso) - { - case rsx::texture_max_anisotropy::x1: return 1.0f; - case rsx::texture_max_anisotropy::x2: return 2.0f; - case rsx::texture_max_anisotropy::x4: return 4.0f; - case rsx::texture_max_anisotropy::x6: return 6.0f; - case rsx::texture_max_anisotropy::x8: return 8.0f; - case rsx::texture_max_anisotropy::x10: return 10.0f; - case rsx::texture_max_anisotropy::x12: return 12.0f; - case rsx::texture_max_anisotropy::x16: return 16.0f; - } - - LOG_ERROR(RSX, "Texture anisotropy error: bad max aniso (%d)", (u32)aniso); - return 1.0f; - } - u16 texture::get_pitch_modifier(u32 format) { switch (format) @@ -535,49 +583,7 @@ namespace rsx __glcheck glTexParameteri(m_target, GL_TEXTURE_SWIZZLE_G, remap_values[2]); __glcheck glTexParameteri(m_target, GL_TEXTURE_SWIZZLE_B, remap_values[3]); - __glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_S, gl_wrap(tex.wrap_s())); - __glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_T, gl_wrap(tex.wrap_t())); - __glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_R, gl_wrap(tex.wrap_r())); - - if (tex.get_exact_mipmap_count() <= 1 || m_target == GL_TEXTURE_RECTANGLE) - { - GLint min_filter = gl_tex_min_filter(tex.min_filter()); - - if (min_filter != GL_LINEAR && min_filter != GL_NEAREST) - { - LOG_WARNING(RSX, "Texture %d, target 0x%x, requesting mipmap filtering without any mipmaps set!", m_id, m_target); - - switch (min_filter) - { - case GL_NEAREST_MIPMAP_NEAREST: - case GL_NEAREST_MIPMAP_LINEAR: - min_filter = GL_NEAREST; break; - case GL_LINEAR_MIPMAP_NEAREST: - case GL_LINEAR_MIPMAP_LINEAR: - min_filter = GL_LINEAR; break; - default: - LOG_ERROR(RSX, "No mipmap fallback defined for rsx_min_filter = 0x%X", (u32)tex.min_filter()); - min_filter = GL_NEAREST; - } - } - - __glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_FILTER, min_filter); - - __glcheck glTexParameterf(m_target, GL_TEXTURE_LOD_BIAS, 0.); - __glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_LOD, 0); - __glcheck glTexParameteri(m_target, GL_TEXTURE_MAX_LOD, 0); - } - else - { - __glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_FILTER, gl_tex_min_filter(tex.min_filter())); - - __glcheck glTexParameterf(m_target, GL_TEXTURE_LOD_BIAS, tex.bias()); - __glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_LOD, (tex.min_lod() >> 8)); - __glcheck glTexParameteri(m_target, GL_TEXTURE_MAX_LOD, (tex.max_lod() >> 8)); - } - - __glcheck glTexParameteri(m_target, GL_TEXTURE_MAG_FILTER, gl_tex_mag_filter(tex.mag_filter())); - __glcheck glTexParameterf(m_target, GL_TEXTURE_MAX_ANISOTROPY_EXT, max_aniso(tex.max_aniso())); + //The rest of sampler state is now handled by sampler state objects } void texture::init(int index, rsx::vertex_texture& tex) diff --git a/rpcs3/Emu/RSX/GL/GLTexture.h b/rpcs3/Emu/RSX/GL/GLTexture.h index e12271fa3f..464a9c05a6 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.h +++ b/rpcs3/Emu/RSX/GL/GLTexture.h @@ -5,7 +5,42 @@ namespace rsx { class vertex_texture; class fragment_texture; +} +namespace gl +{ + GLenum get_sized_internal_format(u32 gcm_format); + std::tuple get_format_type(u32 texture_format); + GLenum wrap_mode(rsx::texture_wrap_mode wrap); + float max_aniso(rsx::texture_max_anisotropy aniso); + + class sampler_state + { + GLuint samplerHandle = 0; + + public: + + void create() + { + glGenSamplers(1, &samplerHandle); + } + + void remove() + { + glDeleteSamplers(1, &samplerHandle); + } + + void bind(int index) + { + glBindSampler(index, samplerHandle); + } + + void apply(rsx::fragment_texture& tex); + }; +} + +namespace rsx +{ namespace gl { class texture @@ -16,28 +51,6 @@ namespace rsx public: void create(); - int gl_wrap(rsx::texture_wrap_mode in); - - float max_aniso(rsx::texture_max_anisotropy aniso); - - inline static u8 convert_4_to_8(u8 v) - { - // Swizzle bits: 00001234 -> 12341234 - return (v << 4) | (v); - } - - inline static u8 convert_5_to_8(u8 v) - { - // Swizzle bits: 00012345 -> 12345123 - return (v << 3) | (v >> 2); - } - - inline static u8 convert_6_to_8(u8 v) - { - // Swizzle bits: 00123456 -> 12345612 - return (v << 2) | (v >> 4); - } - void init(int index, rsx::fragment_texture& tex); void init(int index, rsx::vertex_texture& tex); @@ -64,9 +77,3 @@ namespace rsx }; } } - -namespace gl -{ - GLenum get_sized_internal_format(u32 gcm_format); - std::tuple get_format_type(u32 texture_format); -} diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp index 6e338ffccc..65c7ad6c7f 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -9,25 +9,25 @@ namespace gl { bool texture_cache::flush_section(u32 address) { - if (address < rtt_cache_range.first || - address >= rtt_cache_range.second) + if (address < no_access_range.first || + address >= no_access_range.second) return false; bool post_task = false; - cached_rtt_section* section_to_post = nullptr; + cached_texture_section* section_to_post = nullptr; { std::lock_guard lock(m_section_mutex); - for (cached_rtt_section &rtt : m_rtt_cache) + for (cached_texture_section &tex : no_access_memory_sections) { - if (rtt.is_dirty()) continue; + if (tex.is_dirty()) continue; - if (rtt.is_locked() && rtt.overlaps(address)) + if (tex.is_locked() && tex.overlaps(address)) { - if (rtt.is_flushed()) + if (tex.is_flushed()) { - LOG_WARNING(RSX, "Section matches range, but marked as already flushed!, 0x%X+0x%X", rtt.get_section_base(), rtt.get_section_size()); + LOG_WARNING(RSX, "Section matches range, but marked as already flushed!, 0x%X+0x%X", tex.get_section_base(), tex.get_section_size()); continue; } @@ -36,11 +36,11 @@ namespace gl if (std::this_thread::get_id() != m_renderer_thread) { post_task = true; - section_to_post = &rtt; + section_to_post = &tex; break; } - rtt.flush(); + tex.flush(); return true; } } diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index e34c661d7e..3adde84f09 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -9,79 +9,31 @@ #include #include #include +#include #include "GLRenderTargets.h" #include "../Common/TextureUtils.h" -#include +#include "../../Memory/vm.h" +#include "Utilities/Config.h" class GLGSRender; +extern cfg::bool_entry g_cfg_rsx_write_color_buffers; +extern cfg::bool_entry g_cfg_rsx_write_depth_buffer; + namespace gl { class texture_cache { public: - class cached_texture_section : public rsx::buffered_section - { - u32 texture_id = 0; - u32 width = 0; - u32 height = 0; - u16 mipmaps = 0; - - public: - - void create(u32 id, u32 width, u32 height, u32 mipmaps) - { - verify(HERE), locked == false; - - texture_id = id; - this->width = width; - this->height = height; - this->mipmaps = mipmaps; - } - - bool matches(u32 rsx_address, u32 width, u32 height, u32 mipmaps) const - { - if (rsx_address == cpu_address_base && texture_id != 0) - { - if (!width && !height && !mipmaps) - return true; - - return (width == this->width && height == this->height && mipmaps == this->mipmaps); - } - - return false; - } - - void destroy() - { - if (locked) - unprotect(); - - glDeleteTextures(1, &texture_id); - texture_id = 0; - } - - bool is_empty() const - { - return (texture_id == 0); - } - - u32 id() const - { - return texture_id; - } - }; - - class cached_rtt_section : public rsx::buffered_section { private: fence m_fence; u32 pbo_id = 0; u32 pbo_size = 0; - u32 source_texture = 0; + u32 vram_texture = 0; bool copied = false; bool flushed = false; @@ -161,6 +113,7 @@ namespace gl return size; } + //TODO: Move swscale routines to RSX shared void scale_image_fallback(u8* dst, const u8* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 pixel_size, u8 samples) { u32 dst_offset = 0; @@ -249,15 +202,47 @@ namespace gl public: - void reset(u32 base, u32 size) + void reset(const u32 base, const u32 size, const bool flushable) { rsx::buffered_section::reset(base, size); - init_buffer(); + + if (flushable) + init_buffer(); flushed = false; copied = false; - source_texture = 0; + vram_texture = 0; + } + + void create_read_only(const u32 id, const u32 width, const u32 height) + { + //Only to be used for ro memory, we dont care about most members, just dimensions and the vram texture handle + current_width = width; + current_height = height; + vram_texture = id; + + current_pitch = 0; + real_pitch = 0; + } + + bool matches(const u32 rsx_address, const u32 rsx_size) + { + return rsx::buffered_section::matches(rsx_address, rsx_size); + } + + bool matches(const u32 rsx_address, const u32 width, const u32 height) + { + if (cpu_address_base == rsx_address && !dirty) + { + //Mostly only used to debug; matches textures without checking dimensions + if (width == 0 && height == 0) + return true; + + return (current_width == width && current_height == height); + } + + return false; } void set_dimensions(u32 width, u32 height, u32 pitch) @@ -269,7 +254,7 @@ namespace gl real_pitch = width * get_pixel_size(format, type); } - void set_format(texture::format gl_format, texture::type gl_type, bool swap_bytes) + void set_format(const texture::format gl_format, const texture::type gl_type, const bool swap_bytes) { format = gl_format; type = gl_type; @@ -280,20 +265,20 @@ namespace gl void set_source(gl::texture &source) { - source_texture = source.id(); + vram_texture = source.id(); } void copy_texture() { - if (!glIsTexture(source_texture)) + if (!glIsTexture(vram_texture)) { - LOG_ERROR(RSX, "Attempted to download rtt texture, but texture handle was invalid! (0x%X)", source_texture); + LOG_ERROR(RSX, "Attempted to download rtt texture, but texture handle was invalid! (0x%X)", vram_texture); return; } glPixelStorei(GL_PACK_SWAP_BYTES, pack_unpack_swap_bytes); glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id); - glGetTextureImageEXT(source_texture, GL_TEXTURE_2D, 0, (GLenum)format, (GLenum)type, nullptr); + glGetTextureImageEXT(vram_texture, GL_TEXTURE_2D, 0, (GLenum)format, (GLenum)type, nullptr); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); m_fence.reset(); @@ -393,9 +378,19 @@ namespace gl if (locked) unprotect(); - glDeleteBuffers(1, &pbo_id); - pbo_id = 0; - pbo_size = 0; + if (pbo_id == 0) + { + //Read-only texture, destroy texture memory + glDeleteTextures(1, &vram_texture); + vram_texture = 0; + } + else + { + //Destroy pbo cache since vram texture is managed elsewhere + glDeleteBuffers(1, &pbo_id); + pbo_id = 0; + pbo_size = 0; + } m_fence.destroy(); } @@ -405,86 +400,229 @@ namespace gl return flushed; } - void set_flushed(bool state) + void set_flushed(const bool state) { flushed = state; } - void set_copied(bool state) + void set_copied(const bool state) { copied = state; } + + bool is_empty() const + { + return vram_texture == 0; + } + + const u32 id() + { + return vram_texture; + } + + std::tuple get_dimensions() + { + return std::make_tuple(current_width, current_height); + } + }; + + class blitter + { + fbo fbo_argb8; + fbo fbo_rgb565; + fbo blit_src; + + u32 argb8_surface = 0; + u32 rgb565_surface = 0; + + public: + + void init() + { + fbo_argb8.create(); + fbo_rgb565.create(); + blit_src.create(); + + glGenTextures(1, &argb8_surface); + glBindTexture(GL_TEXTURE_2D, argb8_surface); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, 4096, 4096); + + glGenTextures(1, &rgb565_surface); + glBindTexture(GL_TEXTURE_2D, rgb565_surface); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGB565, 4096, 4096); + + s32 old_fbo = 0; + glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo); + + fbo_argb8.bind(); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, argb8_surface, 0); + + fbo_rgb565.bind(); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, rgb565_surface, 0); + + glBindFramebuffer(GL_FRAMEBUFFER, old_fbo); + + fbo_argb8.check(); + fbo_rgb565.check(); + } + + void destroy() + { + fbo_argb8.remove(); + fbo_rgb565.remove(); + blit_src.remove(); + + glDeleteTextures(1, &argb8_surface); + glDeleteTextures(1, &rgb565_surface); + } + + u32 scale_image(u32 src, u32 dst, const areai src_rect, const areai dst_rect, const position2i dst_offset, const position2i clip_offset, + const size2i dst_dims, const size2i clip_dims, bool is_argb8, bool linear_interpolation) + { + s32 old_fbo = 0; + glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo); + + blit_src.bind(); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src, 0); + blit_src.check(); + + u32 src_surface = 0; + u32 dst_tex = dst; + filter interp = linear_interpolation ? filter::linear : filter::nearest; + + if (!dst_tex) + { + glGenTextures(1, &dst_tex); + glBindTexture(GL_TEXTURE_2D, dst_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + + if (is_argb8) + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, dst_dims.width, dst_dims.height); + else + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGB565, dst_dims.width, dst_dims.height); + } + + GLboolean scissor_test_enabled = glIsEnabled(GL_SCISSOR_TEST); + if (scissor_test_enabled) + glDisable(GL_SCISSOR_TEST); + + if (is_argb8) + { + blit_src.blit(fbo_argb8, src_rect, dst_rect, buffers::color, interp); + src_surface = argb8_surface; + } + else + { + blit_src.blit(fbo_rgb565, src_rect, dst_rect, buffers::color, interp); + src_surface = rgb565_surface; + } + + glCopyImageSubData(src_surface, GL_TEXTURE_2D, 0, clip_offset.x, clip_offset.y, 0, + dst_tex, GL_TEXTURE_2D, 0, dst_offset.x, dst_offset.y, 0, clip_dims.width, clip_dims.height, 1); + + if (scissor_test_enabled) + glEnable(GL_SCISSOR_TEST); + + glBindFramebuffer(GL_FRAMEBUFFER, old_fbo); + return dst_tex; + } }; private: - std::vector m_texture_cache; - std::vector m_rtt_cache; + std::vector read_only_memory_sections; + std::vector no_access_memory_sections; std::vector m_temporary_surfaces; - std::pair texture_cache_range = std::make_pair(0xFFFFFFFF, 0); - std::pair rtt_cache_range = std::make_pair(0xFFFFFFFF, 0); + std::pair read_only_range = std::make_pair(0xFFFFFFFF, 0); + std::pair no_access_range = std::make_pair(0xFFFFFFFF, 0); + + blitter m_hw_blitter; std::mutex m_section_mutex; GLGSRender *m_renderer; std::thread::id m_renderer_thread; - cached_texture_section *find_texture(u64 texaddr, u32 w, u32 h, u16 mipmaps) + cached_texture_section *find_texture_from_dimensions(u64 texaddr, u32 w, u32 h) { - for (cached_texture_section &tex : m_texture_cache) + std::lock_guard lock(m_section_mutex); + + for (cached_texture_section &tex : read_only_memory_sections) { - if (tex.matches(texaddr, w, h, mipmaps) && !tex.is_dirty()) + if (tex.matches(texaddr, w, h) && !tex.is_dirty()) return &tex; } return nullptr; } - cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h, u16 mipmap) + /** + * Searches for a texture from read_only memory sections + * Texture origin + size must be a subsection of the existing texture + */ + cached_texture_section *find_texture_from_range(u32 texaddr, u32 range) { - for (cached_texture_section &tex : m_texture_cache) + std::lock_guard lock(m_section_mutex); + + auto test = std::make_pair(texaddr, range); + for (cached_texture_section &tex : read_only_memory_sections) + { + if (tex.get_section_base() > texaddr) + continue; + + if (tex.overlaps(test, true) && !tex.is_dirty()) + return &tex; + } + + return nullptr; + } + + cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h) + { + for (cached_texture_section &tex : read_only_memory_sections) { if (tex.is_dirty()) { tex.destroy(); - tex.reset(texaddr, texsize); - tex.create(id, w, h, mipmap); + tex.reset(texaddr, texsize, false); + tex.create_read_only(id, w, h); - texture_cache_range = tex.get_min_max(texture_cache_range); + read_only_range = tex.get_min_max(read_only_range); return tex; } } cached_texture_section tex; - tex.reset(texaddr, texsize); - tex.create(id, w, h, mipmap); - texture_cache_range = tex.get_min_max(texture_cache_range); + tex.reset(texaddr, texsize, false); + tex.create_read_only(id, w, h); + read_only_range = tex.get_min_max(read_only_range); - m_texture_cache.push_back(tex); - return m_texture_cache.back(); + read_only_memory_sections.push_back(tex); + return read_only_memory_sections.back(); } void clear() { - for (cached_texture_section &tex : m_texture_cache) + for (cached_texture_section &tex : read_only_memory_sections) { tex.destroy(); } - for (cached_rtt_section &rtt : m_rtt_cache) + for (cached_texture_section &tex : no_access_memory_sections) { - rtt.destroy(); + tex.destroy(); } - m_rtt_cache.resize(0); - m_texture_cache.resize(0); + read_only_memory_sections.resize(0); + no_access_memory_sections.resize(0); clear_temporary_surfaces(); } - cached_rtt_section* find_cached_rtt_section(u32 base, u32 size) + cached_texture_section* find_cached_rtt_section(u32 base, u32 size) { - for (cached_rtt_section &rtt : m_rtt_cache) + for (cached_texture_section &rtt : no_access_memory_sections) { if (rtt.matches(base, size)) { @@ -495,17 +633,17 @@ namespace gl return nullptr; } - cached_rtt_section *create_locked_view_of_section(u32 base, u32 size) + cached_texture_section *create_locked_view_of_section(u32 base, u32 size) { - cached_rtt_section *region = find_cached_rtt_section(base, size); + cached_texture_section *region = find_cached_rtt_section(base, size); if (!region) { - for (cached_rtt_section &rtt : m_rtt_cache) + for (cached_texture_section &rtt : no_access_memory_sections) { if (rtt.is_dirty()) { - rtt.reset(base, size); + rtt.reset(base, size, true); rtt.protect(utils::protection::no); region = &rtt; break; @@ -514,16 +652,16 @@ namespace gl if (!region) { - cached_rtt_section section; - section.reset(base, size); + cached_texture_section section; + section.reset(base, size, true); section.set_dirty(true); section.protect(utils::protection::no); - m_rtt_cache.push_back(section); - region = &m_rtt_cache.back(); + no_access_memory_sections.push_back(section); + region = &no_access_memory_sections.back(); } - rtt_cache_range = region->get_min_max(rtt_cache_range); + no_access_range = region->get_min_max(no_access_range); } else { @@ -531,7 +669,7 @@ namespace gl if (region->get_section_size() != size) { region->unprotect(); - region->reset(base, size); + region->reset(base, size, true); } if (!region->is_locked() || region->is_flushed()) @@ -582,11 +720,15 @@ namespace gl { m_renderer = renderer; m_renderer_thread = std::this_thread::get_id(); + + m_hw_blitter.init(); } void close() { clear(); + + m_hw_blitter.destroy(); } template @@ -594,6 +736,12 @@ namespace gl { const u32 texaddr = rsx::get_address(tex.offset(), tex.location()); const u32 range = (u32)get_texture_size(tex); + + const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN); + const u32 tex_width = tex.width(); + const u32 tex_height = tex.height(); + const u32 native_pitch = (tex_width * get_format_block_size_in_bytes(format)); + const u32 tex_pitch = (tex.pitch() == 0)? native_pitch: tex.pitch(); if (!texaddr || !range) { @@ -627,15 +775,19 @@ namespace gl * a bound render target. We can bypass the expensive download in this case */ - surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, tex.width(), tex.height(), tex.pitch()); + const f32 internal_scale = (f32)tex_pitch / native_pitch; + const u32 internal_width = tex_width * internal_scale; + + const surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true); if (rsc.surface) { //Check that this region is not cpu-dirty before doing a copy //This section is guaranteed to have a locking section *if* this bit has been bypassed before + //Is this really necessary? bool upload_from_cpu = false; - for (cached_rtt_section §ion : m_rtt_cache) + for (cached_texture_section §ion : no_access_memory_sections) { if (section.overlaps(std::make_pair(texaddr, range)) && section.is_dirty()) { @@ -655,21 +807,28 @@ namespace gl } else { - const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN); - - GLenum src_format = (GLenum)rsc.surface->get_internal_format(); - GLenum dst_format = std::get<0>(get_format_type(format)); - u32 bound_index = ~0U; - if (src_format != dst_format) + bool dst_is_compressed = (format == CELL_GCM_TEXTURE_COMPRESSED_DXT1 || format == CELL_GCM_TEXTURE_COMPRESSED_DXT23 || format == CELL_GCM_TEXTURE_COMPRESSED_DXT45); + + if (!dst_is_compressed) { - LOG_WARNING(RSX, "Sampling from a section of a render target, but formats might be incompatible (0x%X vs 0x%X)", src_format, dst_format); + GLenum src_format = (GLenum)rsc.surface->get_internal_format(); + GLenum dst_format = std::get<0>(get_format_type(format)); + + if (src_format != dst_format) + { + LOG_WARNING(RSX, "Sampling from a section of a render target, but formats might be incompatible (0x%X vs 0x%X)", src_format, dst_format); + } + } + else + { + LOG_WARNING(RSX, "Surface blit from a compressed texture"); } if (!rsc.is_bound) { - if (rsc.w == tex.width() && rsc.h == tex.height()) + if (rsc.w == tex_width && rsc.h == tex_height) rsc.surface->bind(); else bound_index = create_temporary_subresource(rsc.surface->id(), (GLenum)rsc.surface->get_compatible_internal_format(), rsc.x, rsc.y, rsc.w, rsc.h); @@ -691,8 +850,7 @@ namespace gl * Search in cache and upload/bind */ - cached_texture_section *cached_texture = find_texture(texaddr, tex.width(), tex.height(), tex.get_exact_mipmap_count()); - + cached_texture_section *cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height); if (cached_texture) { verify(HERE), cached_texture->is_empty() == false; @@ -705,17 +863,38 @@ namespace gl return; } - if (!tex.width() || !tex.height()) + /** + * Check for subslices from the cache in case we only have a subset a larger texture + */ + cached_texture = find_texture_from_range(texaddr, range); + if (cached_texture) { - LOG_ERROR(RSX, "Texture upload requested but invalid texture dimensions passed"); - return; + const u32 address_offset = texaddr - cached_texture->get_section_base(); + const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN); + const GLenum ifmt = gl::get_sized_internal_format(format); + + u16 offset_x = 0, offset_y = 0; + + if (address_offset) + { + const u32 bpp = get_format_block_size_in_bytes(format); + + offset_y = address_offset / tex_pitch; + offset_x = address_offset % tex_pitch; + + offset_x /= bpp; + offset_y /= bpp; + } + + u32 texture_id = create_temporary_subresource(cached_texture->id(), ifmt, offset_x, offset_y, tex_width, tex_height); + if (texture_id) return; } gl_texture.init(index, tex); std::lock_guard lock(m_section_mutex); - cached_texture_section &cached = create_texture(gl_texture.id(), texaddr, get_texture_size(tex), tex.width(), tex.height(), tex.get_exact_mipmap_count()); + cached_texture_section &cached = create_texture(gl_texture.id(), texaddr, get_texture_size(tex), tex_width, tex_height); cached.protect(utils::protection::ro); cached.set_dirty(false); @@ -727,7 +906,7 @@ namespace gl { std::lock_guard lock(m_section_mutex); - cached_rtt_section *region = find_cached_rtt_section(base, size); + cached_texture_section *region = find_cached_rtt_section(base, size); if (!region) { @@ -751,7 +930,7 @@ namespace gl { std::lock_guard lock(m_section_mutex); - cached_rtt_section *region = create_locked_view_of_section(base, size); + cached_texture_section *region = create_locked_view_of_section(base, size); if (!region->matches(base, size)) { @@ -759,7 +938,7 @@ namespace gl if (region->is_locked()) region->unprotect(); - region->reset(base, size); + region->reset(base, size, true); region->protect(utils::protection::no); } @@ -776,7 +955,7 @@ namespace gl bool load_rtt(gl::texture &tex, const u32 address, const u32 pitch) { const u32 range = tex.height() * pitch; - cached_rtt_section *rtt = find_cached_rtt_section(address, range); + cached_texture_section *rtt = find_cached_rtt_section(address, range); if (rtt && !rtt->is_dirty()) { @@ -796,16 +975,45 @@ namespace gl //TODO: Optimize this function! //Multi-pass checking is slow. Pre-calculate dependency tree at section creation - if (address >= texture_cache_range.first && - address < texture_cache_range.second) + if (address >= read_only_range.first && + address < read_only_range.second) { std::lock_guard lock(m_section_mutex); - for (int i = 0; i < m_texture_cache.size(); ++i) + for (int i = 0; i < read_only_memory_sections.size(); ++i) { - auto &tex = m_texture_cache[i]; + auto &tex = read_only_memory_sections[i]; if (!tex.is_locked()) continue; + auto overlapped = tex.overlaps_page(trampled_range, address); + if (std::get<0>(overlapped)) + { + auto &new_range = std::get<1>(overlapped); + + if (new_range.first != trampled_range.first || + new_range.second != trampled_range.second) + { + trampled_range = new_range; + i = 0; + } + + tex.unprotect(); + tex.set_dirty(true); + response = true; + } + } + } + + if (address >= no_access_range.first && + address < no_access_range.second) + { + std::lock_guard lock(m_section_mutex); + + for (int i = 0; i < no_access_memory_sections.size(); ++i) + { + auto &tex = no_access_memory_sections[i]; + if (tex.is_dirty() || !tex.is_locked()) continue; + auto overlapped = tex.overlaps_page(trampled_range, address); if (std::get<0>(overlapped)) { @@ -826,36 +1034,6 @@ namespace gl } } - if (address >= rtt_cache_range.first && - address < rtt_cache_range.second) - { - std::lock_guard lock(m_section_mutex); - - for (int i = 0; i < m_rtt_cache.size(); ++i) - { - auto &rtt = m_rtt_cache[i]; - if (rtt.is_dirty() || !rtt.is_locked()) continue; - - auto overlapped = rtt.overlaps_page(trampled_range, address); - if (std::get<0>(overlapped)) - { - auto &new_range = std::get<1>(overlapped); - - if (new_range.first != trampled_range.first || - new_range.second != trampled_range.second) - { - trampled_range = new_range; - i = 0; - } - - rtt.unprotect(); - rtt.set_dirty(true); - - response = true; - } - } - } - return response; } @@ -864,25 +1042,25 @@ namespace gl std::lock_guard lock(m_section_mutex); std::pair range = std::make_pair(base, size); - if (base < texture_cache_range.second && - (base + size) >= texture_cache_range.first) + if (base < read_only_range.second && + (base + size) >= read_only_range.first) { - for (cached_texture_section &tex : m_texture_cache) + for (cached_texture_section &tex : read_only_memory_sections) { if (!tex.is_dirty() && tex.overlaps(range)) tex.destroy(); } } - if (base < rtt_cache_range.second && - (base + size) >= rtt_cache_range.first) + if (base < no_access_range.second && + (base + size) >= no_access_range.first) { - for (cached_rtt_section &rtt : m_rtt_cache) + for (cached_texture_section &tex : no_access_memory_sections) { - if (!rtt.is_dirty() && rtt.overlaps(range)) + if (!tex.is_dirty() && tex.overlaps(range)) { - rtt.unprotect(); - rtt.set_dirty(true); + tex.unprotect(); + tex.set_dirty(true); } } } @@ -899,5 +1077,240 @@ namespace gl m_temporary_surfaces.clear(); } + + bool upload_scaled_image(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, gl_render_targets &m_rtts) + { + //Since we will have dst in vram, we can 'safely' ignore the swizzle flag + //TODO: Verify correct behavior + + bool src_is_render_target = false; + bool dst_is_render_target = false; + bool dst_is_argb8 = (dst.format == rsx::blit_engine::transfer_destination_format::a8r8g8b8); + bool src_is_argb8 = (src.format == rsx::blit_engine::transfer_source_format::a8r8g8b8); + + GLenum src_gl_sized_format = src_is_argb8? GL_RGBA8: GL_RGB565; + GLenum src_gl_format = src_is_argb8 ? GL_BGRA : GL_RGB; + GLenum src_gl_type = src_is_argb8? GL_UNSIGNED_INT_8_8_8_8: GL_UNSIGNED_SHORT_5_6_5; + + u32 vram_texture = 0; + u32 dest_texture = 0; + + const u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0)); + const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); + + //Check if src/dst are parts of render targets + surface_subresource dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true); + dst_is_render_target = dst_subres.surface != nullptr; + + u16 max_dst_width = dst.width; + u16 max_dst_height = dst.height; + + //Prepare areas and offsets + //Copy from [src.offset_x, src.offset_y] a region of [clip.width, clip.height] + //Stretch onto [dst.offset_x, y] with clipping performed on the source region + //The implementation here adds the inverse scaled clip dimensions onto the source to completely bypass final clipping step + + float scale_x = (f32)dst.width / src.width; + float scale_y = (f32)dst.height / src.height; + + //Clip offset is unused if the clip offsets are reprojected onto the source + position2i clip_offset = { 0, 0 };//{ dst.clip_x, dst.clip_y }; + position2i dst_offset = { dst.offset_x, dst.offset_y }; + + size2i clip_dimensions = { dst.clip_width, dst.clip_height }; + const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.height }; + + //Offset in x and y for src is 0 (it is already accounted for when getting pixels_src) + //Reproject final clip onto source... + const u16 src_w = clip_dimensions.width / scale_x; + const u16 src_h = clip_dimensions.height / scale_y; + + areai src_area = { 0, 0, src_w, src_h }; + areai dst_area = { 0, 0, dst.clip_width, dst.clip_height }; + + //If destination is neither a render target nor an existing texture in VRAM + //its possible that this method is being used to perform a memcpy into RSX memory, so we check + //parameters. Whenever a simple memcpy can get the job done, use it instead. + //Dai-3-ji Super Robot Taisen for example uses this to copy program code to GPU RAM + + bool is_memcpy = false; + u32 memcpy_bytes_length = 0; + if (dst_is_argb8 == src_is_argb8 && !dst.swizzled) + { + if ((src.slice_h == 1 && dst.clip_height == 1) || + (dst.clip_width == src.width && dst.clip_height == src.slice_h && src.pitch == dst.pitch)) + { + const u8 bpp = dst_is_argb8 ? 4 : 2; + is_memcpy = true; + memcpy_bytes_length = dst.clip_width * bpp * dst.clip_height; + } + } + + if (!dst_is_render_target) + { + //First check if this surface exists in VRAM with exact dimensions + //Since scaled GPU resources are not invalidated by the CPU, we need to reuse older surfaces if possible + auto cached_dest = find_texture_from_dimensions(dst.rsx_address, dst_dimensions.width, dst_dimensions.height); + + //Check for any available region that will fit this one + if (!cached_dest) cached_dest = find_texture_from_range(dst.rsx_address, dst.pitch * dst.clip_height); + + if (cached_dest) + { + //TODO: Verify that the new surface will fit + dest_texture = cached_dest->id(); + + //TODO: Move this code into utils since it is used alot + const u32 address_offset = dst.rsx_address - cached_dest->get_section_base(); + + const u16 bpp = dst_is_argb8 ? 4 : 2; + const u16 offset_y = address_offset / dst.pitch; + const u16 offset_x = address_offset % dst.pitch; + + dst_offset.x += offset_x / bpp; + dst_offset.y += offset_y; + + std::tie(max_dst_width, max_dst_height) = cached_dest->get_dimensions(); + } + else if (is_memcpy) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + else + { + dst_offset.x = dst_subres.x; + dst_offset.y = dst_subres.y; + + dest_texture = dst_subres.surface->id(); + + auto dims = dst_subres.surface->get_dimensions(); + max_dst_width = dims.first; + max_dst_height = dims.second; + + if (is_memcpy) + { + //Some render target descriptions are actually invalid + //Confirm this is a flushable RTT + const auto rsx_pitch = dst_subres.surface->get_rsx_pitch(); + const auto native_pitch = dst_subres.surface->get_native_pitch(); + + if (rsx_pitch <= 64 && native_pitch != rsx_pitch) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + } + + surface_subresource src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true); + src_is_render_target = src_subres.surface != nullptr; + + //Create source texture if does not exist + if (!src_is_render_target) + { + auto preloaded_texture = find_texture_from_dimensions(src_address, src.width, src.slice_h); + + if (preloaded_texture != nullptr) + { + vram_texture = preloaded_texture->id(); + } + else + { + flush_section(src_address); + + GLboolean swap_bytes = !src_is_argb8; + if (dst.swizzled) + { + //TODO: Check results against 565 textures + if (src_is_argb8) + { + src_gl_format = GL_RGBA; + swap_bytes = true; + } + else + { + LOG_ERROR(RSX, "RGB565 swizzled texture upload found"); + } + } + + glGenTextures(1, &vram_texture); + glBindTexture(GL_TEXTURE_2D, vram_texture); + glTexStorage2D(GL_TEXTURE_2D, 1, src_gl_sized_format, src.width, src.slice_h); + glPixelStorei(GL_UNPACK_ROW_LENGTH, src.pitch / (src_is_argb8 ? 4 : 2)); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glPixelStorei(GL_UNPACK_SWAP_BYTES, swap_bytes); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, src.width, src.slice_h, src_gl_format, src_gl_type, src.pixels); + + std::lock_guard lock(m_section_mutex); + + auto §ion = create_texture(vram_texture, src_address, src.pitch * src.slice_h, src.width, src.slice_h); + section.protect(utils::protection::ro); + section.set_dirty(false); + } + } + else + { + if (src_subres.w != clip_dimensions.width || + src_subres.h != clip_dimensions.height) + { + f32 subres_scaling_x = (f32)src.pitch / src_subres.surface->get_native_pitch(); + + dst_area.x2 = (src_subres.w * scale_x * subres_scaling_x); + dst_area.y2 = (src_subres.h * scale_y); + } + + src_area.x2 = src_subres.w; + src_area.y2 = src_subres.h; + + src_area.x1 += src_subres.x; + src_area.x2 += src_subres.x; + src_area.y1 += src_subres.y; + src_area.y2 += src_subres.y; + + vram_texture = src_subres.surface->id(); + } + + //Validate clip offsets (Persona 4 Arena at 720p) + //Check if can fit + //NOTE: It is possible that the check is simpler (if (clip_x >= clip_width)) + //Needs verification + if ((dst.offset_x + dst.clip_x + dst.clip_width) > max_dst_width) dst.clip_x = 0; + if ((dst.offset_y + dst.clip_y + dst.clip_width) > max_dst_height) dst.clip_y = 0; + + if (dst.clip_x || dst.clip_y) + { + //Reproject clip offsets onto source + const u16 scaled_clip_offset_x = dst.clip_x / scale_x; + const u16 scaled_clip_offset_y = dst.clip_y / scale_y; + + src_area.x1 += scaled_clip_offset_x; + src_area.x2 += scaled_clip_offset_x; + src_area.y1 += scaled_clip_offset_y; + src_area.y2 += scaled_clip_offset_y; + } + + u32 texture_id = m_hw_blitter.scale_image(vram_texture, dest_texture, src_area, dst_area, dst_offset, clip_offset, + dst_dimensions, clip_dimensions, dst_is_argb8, interpolate); + + if (dest_texture) + return true; + + //TODO: Verify if any titles ever scale into CPU memory. It defeats the purpose of uploading data to the GPU, but it could happen + //If so, add this texture to the no_access queue not the read_only queue + std::lock_guard lock(m_section_mutex); + + cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, dst.width, dst.clip_height); + //These textures are completely GPU resident so we dont watch for CPU access + //There's no data to be fetched from the CPU + //Its is possible for a title to attempt to read from the region, but the CPU path should be used in such cases + cached.protect(utils::protection::rw); + cached.set_dirty(false); + + return true; + } }; } \ No newline at end of file diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h index 8a31197b9b..7691fe152e 100644 --- a/rpcs3/Emu/RSX/rsx_cache.h +++ b/rpcs3/Emu/RSX/rsx_cache.h @@ -2,6 +2,7 @@ #include #include "Utilities/VirtualMemory.h" #include "Emu/Memory/vm.h" +#include "gcm_enums.h" namespace rsx { @@ -37,14 +38,18 @@ namespace rsx u16 offset_y; u16 width; u16 height; - u16 slice; + u16 slice_h; u16 pitch; void *pixels; + + u32 rsx_address; }; struct blit_dst_info { blit_engine::transfer_destination_format format; + u16 offset_x; + u16 offset_y; u16 width; u16 height; u16 pitch; @@ -52,8 +57,11 @@ namespace rsx u16 clip_y; u16 clip_width; u16 clip_height; + bool swizzled; void *pixels; + + u32 rsx_address; }; class shaders_cache @@ -107,26 +115,9 @@ namespace rsx bool locked = false; bool dirty = false; - bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2) + inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2) { - //Check for memory area overlap. unlock page(s) if needed and add this index to array. - //Axis separation test - const u32 &block_start = base1; - const u32 block_end = limit1; - - if (limit2 < block_start) return false; - if (base2 > block_end) return false; - - u32 min_separation = (limit2 - base2) + (limit1 - base1); - u32 range_limit = (block_end > limit2) ? block_end : limit2; - u32 range_base = (block_start < base2) ? block_start : base2; - - u32 actual_separation = (range_limit - range_base); - - if (actual_separation < min_separation) - return true; - - return false; + return (base1 < limit2 && base2 < limit1); } public: @@ -171,6 +162,19 @@ namespace rsx return (locked_address_base <= address && (address - locked_address_base) < locked_address_range); } + /** + * Check if range overlaps with this section. + * ignore_protection_range - if true, the test should not check against the aligned protection range, instead + * tests against actual range of contents in memory + */ + bool overlaps(std::pair range, bool ignore_protection_range) + { + if (!ignore_protection_range) + return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second); + else + return region_overlaps(cpu_address_base, cpu_address_base + cpu_address_range, range.first, range.first + range.second); + } + /** * Check if the page containing the address tramples this section. Also compares a former trampled page range to compare * If true, returns the range with updated invalid range diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 3600dfb4d5..cde5cfd4e5 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -573,29 +573,41 @@ namespace rsx } } - blit_src_info src_info; - blit_dst_info dst_info; + if (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER) + { + //For now, only use this for actual scaled images, there are use cases that should not go through 3d engine, e.g program ucode transfer + //TODO: Figure out more instances where we can use this without problems - src_info.format = src_color_format; - src_info.width = in_w; - src_info.height = in_h; - src_info.pitch = in_pitch; - src_info.slice = slice_h; - src_info.pixels = pixels_src; + blit_src_info src_info; + blit_dst_info dst_info; - dst_info.format = dst_color_format; - dst_info.width = convert_w; - dst_info.height = convert_h; - dst_info.clip_x = clip_x; - dst_info.clip_y = clip_y; - dst_info.clip_width = clip_w; - dst_info.clip_height = clip_h; - dst_info.pitch = in_pitch; - dst_info.pixels = pixels_dst; - dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d); + src_info.format = src_color_format; + src_info.width = in_w; + src_info.height = in_h; + src_info.pitch = in_pitch; + src_info.slice_h = slice_h; + src_info.offset_x = in_x; + src_info.offset_y = in_y; + src_info.pixels = pixels_src; + src_info.rsx_address = get_address(src_offset, src_dma); - if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh)) - return; + dst_info.format = dst_color_format; + dst_info.width = convert_w; + dst_info.height = convert_h; + dst_info.clip_x = clip_x; + dst_info.clip_y = clip_y; + dst_info.clip_width = clip_w; + dst_info.clip_height = clip_h; + dst_info.offset_x = out_x; + dst_info.offset_y = out_y; + dst_info.pitch = out_pitch; + dst_info.pixels = pixels_dst; + dst_info.rsx_address = get_address(dst_offset, dst_dma); + dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d); + + if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh)) + return; + } if (method_registers.blit_engine_context_surface() != blit_engine::context_surface::swizzle2d) { diff --git a/rpcs3/Emu/RSX/rsx_utils.cpp b/rpcs3/Emu/RSX/rsx_utils.cpp index 2b231da3c8..bd1506076d 100644 --- a/rpcs3/Emu/RSX/rsx_utils.cpp +++ b/rpcs3/Emu/RSX/rsx_utils.cpp @@ -30,12 +30,15 @@ namespace rsx void clip_image(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch) { + u8 *pixels_src = (u8*)src + clip_y * src_pitch + clip_x * bpp; + u8 *pixels_dst = dst; + const u32 row_length = clip_w * bpp; + for (int y = 0; y < clip_h; ++y) { - u8 *dst_row = dst + y * dst_pitch; - const u8 *src_row = src + (y + clip_y) * src_pitch + clip_x * bpp; - - std::memmove(dst_row, src_row, clip_w * bpp); + std::memmove(pixels_dst, pixels_src, row_length); + pixels_src += src_pitch; + pixels_dst += dst_pitch; } }