From 6d6d0e4e369f520ed36892802c3d13f98364a067 Mon Sep 17 00:00:00 2001
From: kd-11 <karokidii@gmail.com>
Date: Wed, 29 Mar 2017 22:27:29 +0300
Subject: [PATCH] gl: Use the GPU to scale textures; use ARB_sampler_object

Improve scaling and separate sampler state from texture state

gl: Unify all texture cache objects under one structure, separated by use case
gl: Texture cache fixes

- Acquire lock when finding matching textures
- Account for swizzled surfaces when deciding whether to cpu memcpy
- Handle swizzled images on the GPU
---
 rpcs3/Emu/RSX/GL/GLGSRender.cpp     |  22 +-
 rpcs3/Emu/RSX/GL/GLGSRender.h       |   8 +-
 rpcs3/Emu/RSX/GL/GLProcTable.h      |   7 +
 rpcs3/Emu/RSX/GL/GLRenderTargets.h  |  64 ++-
 rpcs3/Emu/RSX/GL/GLTexture.cpp      | 216 ++++----
 rpcs3/Emu/RSX/GL/GLTexture.h        |  63 +--
 rpcs3/Emu/RSX/GL/GLTextureCache.cpp |  20 +-
 rpcs3/Emu/RSX/GL/GLTextureCache.h   | 743 ++++++++++++++++++++++------
 rpcs3/Emu/RSX/rsx_cache.h           |  44 +-
 rpcs3/Emu/RSX/rsx_methods.cpp       |  52 +-
 rpcs3/Emu/RSX/rsx_utils.cpp         |  11 +-
 11 files changed, 880 insertions(+), 370 deletions(-)

diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
index 858e8f6f95..800999d306 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -408,6 +408,7 @@ void GLGSRender::end()
 	std::chrono::time_point<steady_clock> textures_start = steady_clock::now();
 
 	//Setup textures
+	//Setting unused texture to 0 is not needed, but makes program validation happy if we choose to enforce it
 	for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
 	{
 		int location;
@@ -422,6 +423,7 @@ void GLGSRender::end()
 		{
 			m_gl_textures[i].set_target(get_gl_target_for_texture(rsx::method_registers.fragment_textures[i]));
 			__glcheck m_gl_texture_cache.upload_texture(i, rsx::method_registers.fragment_textures[i], m_gl_textures[i], m_rtts);
+			__glcheck m_gl_sampler_states[i].apply(rsx::method_registers.fragment_textures[i]);
 		}
 	}
 
@@ -572,6 +574,12 @@ void GLGSRender::on_init_thread()
 	if (g_cfg_rsx_overlay)
 		m_text_printer.init();
 
+	for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
+	{
+		m_gl_sampler_states[i].create();
+		m_gl_sampler_states[i].bind(i);
+	}
+
 	m_gl_texture_cache.initialize(this);
 }
 
@@ -606,6 +614,11 @@ void GLGSRender::on_exit()
 		tex.remove();
 	}
 
+	for (auto &sampler : m_gl_sampler_states)
+	{
+		sampler.remove();
+	}
+
 	m_attrib_ring_buffer->remove();
 	m_transform_constants_buffer->remove();
 	m_fragment_constants_buffer->remove();
@@ -879,7 +892,7 @@ void GLGSRender::flip(int buffer)
 
 	gl::screen.clear(gl::buffers::color_depth_stencil);
 
-	__glcheck flip_fbo->blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical());
+	__glcheck flip_fbo->blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear);
 
 	if (g_cfg_rsx_overlay)
 	{
@@ -960,7 +973,7 @@ void GLGSRender::do_local_task()
 	}
 }
 
-work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::cached_rtt_section *section)
+work_item& GLGSRender::post_flush_request(u32 address, gl::texture_cache::cached_texture_section *section)
 {
 	std::lock_guard<std::mutex> lock(queue_guard);
 
@@ -979,3 +992,8 @@ void GLGSRender::synchronize_buffers()
 		flush_draw_buffers = false;
 	}
 }
+
+bool GLGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) //Forwards the request to the GL texture cache, passing m_rtts along
+{
+	return m_gl_texture_cache.upload_scaled_image(src, dst, interpolate, m_rtts); //Result of the cache call is returned unchanged
+}
diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
index 5cdc01610e..89348215e6 100644
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -9,6 +9,7 @@
 #include "define_new_memleakdetect.h"
 #include "GLProgramBuffer.h"
 #include "GLTextOut.h"
+#include "../rsx_cache.h"
 
 #pragma comment(lib, "opengl32.lib")
 
@@ -18,7 +19,7 @@ struct work_item
 	std::mutex guard_mutex;
 	
 	u32  address_to_flush = 0;
-	gl::texture_cache::cached_rtt_section *section_to_flush = nullptr;
+	gl::texture_cache::cached_texture_section *section_to_flush = nullptr;
 
 	volatile bool processed = false;
 	volatile bool result = false;
@@ -57,6 +58,7 @@ private:
 
 	rsx::gl::texture m_gl_textures[rsx::limits::fragment_textures_count];
 	rsx::gl::texture m_gl_vertex_textures[rsx::limits::vertex_textures_count];
+	gl::sampler_state m_gl_sampler_states[rsx::limits::fragment_textures_count];
 
 	gl::glsl::program *m_program;
 
@@ -129,7 +131,9 @@ public:
 	void set_viewport();
 
 	void synchronize_buffers();
-	work_item& post_flush_request(u32 address, gl::texture_cache::cached_rtt_section *section);
+	work_item& post_flush_request(u32 address, gl::texture_cache::cached_texture_section *section);
+
+	bool scaled_image_from_memory(rsx::blit_src_info& src_info, rsx::blit_dst_info& dst_info, bool interpolate) override;
 
 protected:
 	void begin() override;
diff --git a/rpcs3/Emu/RSX/GL/GLProcTable.h b/rpcs3/Emu/RSX/GL/GLProcTable.h
index 3c55e3fca9..5d03d216cf 100644
--- a/rpcs3/Emu/RSX/GL/GLProcTable.h
+++ b/rpcs3/Emu/RSX/GL/GLProcTable.h
@@ -172,6 +172,13 @@ OPENGL_PROC(PFNGLMULTIDRAWARRAYSPROC, MultiDrawArrays);
 
 OPENGL_PROC(PFNGLGETTEXTUREIMAGEEXTPROC, GetTextureImageEXT);
 
+//Sampler Objects
+OPENGL_PROC(PFNGLGENSAMPLERSPROC, GenSamplers);
+OPENGL_PROC(PFNGLDELETESAMPLERSPROC, DeleteSamplers);
+OPENGL_PROC(PFNGLBINDSAMPLERPROC, BindSampler);
+OPENGL_PROC(PFNGLSAMPLERPARAMETERIPROC, SamplerParameteri);
+OPENGL_PROC(PFNGLSAMPLERPARAMETERFVPROC, SamplerParameterfv);
+
 //Texture Buffers
 OPENGL_PROC(PFNGLTEXBUFFERPROC, TexBuffer);
 OPENGL_PROC(PFNGLTEXTUREBUFFERRANGEEXTPROC, TextureBufferRangeEXT);
diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h
index 04282ae6a6..761bbfa0f3 100644
--- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h
+++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h
@@ -116,7 +116,9 @@ namespace gl
 		}
 
 		// For an address within the texture, extract this sub-section's rect origin
-		std::tuple<bool, u16, u16> get_texture_subresource(u32 offset)
+		// Checks whether we need to scale the subresource if it is not handled in shader
+		// NOTE1: When surface->real_pitch < rsx_pitch, the surface is assumed to have been scaled to fill the rsx_region
+		std::tuple<bool, u16, u16> get_texture_subresource(u32 offset, bool scale_to_fit)
 		{
 			if (!offset)
 			{
@@ -132,9 +134,14 @@ namespace gl
 				if (!surface_pixel_size)
 					surface_pixel_size = native_pitch / surface_width;
 
-				u32 pixel_offset = (offset / surface_pixel_size);
-				u32 y = (pixel_offset / surface_width);
-				u32 x = (pixel_offset % surface_width);
+				const u32 y = (offset / rsx_pitch);
+				u32 x = (offset % rsx_pitch) / surface_pixel_size;
+
+				if (scale_to_fit)
+				{
+					const f32 x_scale = (f32)rsx_pitch / native_pitch;
+					x = (u32)((f32)x / x_scale);
+				}
 
 				return std::make_tuple(true, (u16)x, (u16)y);
 			}
@@ -291,18 +298,19 @@ struct surface_subresource
 
 	bool is_bound = false;
 	bool is_depth_surface = false;
+	bool is_clipped = false;
 
 	surface_subresource() {}
 
-	surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth)
-		: surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth)
+	surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false)
+		: surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), is_clipped(_Clipped)
 	{}
 };
 
 class gl_render_targets : public rsx::surface_store<gl_render_target_traits>
 {
 private:
-	bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y)
+	bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit)
 	{
 		bool is_subslice = false;
 		u16  x_offset = 0;
@@ -314,7 +322,7 @@ private:
 		u32 offset = texaddr - surface_address;
 		if (offset >= 0)
 		{
-			std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset);
+			std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset, scale_to_fit);
 			if (is_subslice)
 			{
 				*x = x_offset;
@@ -354,7 +362,7 @@ private:
 	}
 
 public:
-	surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch)
+	surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit =false, bool crop=false)
 	{
 		gl::render_target *surface = nullptr;
 		bool is_subslice = false;
@@ -366,21 +374,35 @@ public:
 			u32 this_address = std::get<0>(tex_info);
 			surface = std::get<1>(tex_info).get();
 
-			if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset))
+			if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit))
 			{
 				if (surface->get_rsx_pitch() != requested_pitch)
 					continue;
 
 				auto dims = surface->get_dimensions();
-				
+
+				if (scale_to_fit)
+				{
+					f32  pitch_scaling = (f32)requested_pitch / surface->get_native_pitch();
+					requested_width /= pitch_scaling;
+				}
+
 				if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height))
 					return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, false), false };
 				else
 				{
+					if (crop) //Forcefully fit the requested region by clipping and scaling
+					{
+						u16 remaining_width = dims.first - x_offset;
+						u16 remaining_height = dims.second - y_offset;
+
+						return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, false), false, true };
+					}
+
 					if (dims.first >= requested_width && dims.second >= requested_height)
 					{
 						LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region");
-						return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false };
+						return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false, true };
 					}
 				}
 			}
@@ -392,21 +414,35 @@ public:
 			u32 this_address = std::get<0>(tex_info);
 			surface = std::get<1>(tex_info).get();
 
-			if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset))
+			if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit))
 			{
 				if (surface->get_rsx_pitch() != requested_pitch)
 					continue;
 
 				auto dims = surface->get_dimensions();
 				
+				if (scale_to_fit)
+				{
+					f32  pitch_scaling = (f32)requested_pitch / surface->get_native_pitch();
+					requested_width /= pitch_scaling;
+				}
+
 				if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height))
 					return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, true), true };
 				else
 				{
+					if (crop) //Forcefully fit the requested region by clipping and scaling
+					{
+						u16 remaining_width = dims.first - x_offset;
+						u16 remaining_height = dims.second - y_offset;
+
+						return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, true), true, true };
+					}
+
 					if (dims.first >= requested_width && dims.second >= requested_height)
 					{
 						LOG_WARNING(RSX, "Overlapping depth surface exceeds bounds; returning full surface region");
-						return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true };
+						return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true, true };
 					}
 				}
 			}
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp
index ed211eddfd..968e1f0721 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp
@@ -67,6 +67,116 @@ namespace gl
 		}
 		fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format);
 	}
+
+	GLenum wrap_mode(rsx::texture_wrap_mode wrap) //Translates an RSX texture wrap mode into the GL wrap enum
+	{
+		switch (wrap)
+		{
+		case rsx::texture_wrap_mode::wrap: return GL_REPEAT;
+		case rsx::texture_wrap_mode::mirror: return GL_MIRRORED_REPEAT;
+		case rsx::texture_wrap_mode::clamp_to_edge: return GL_CLAMP_TO_EDGE;
+		case rsx::texture_wrap_mode::border: return GL_CLAMP_TO_BORDER;
+		case rsx::texture_wrap_mode::clamp: return GL_CLAMP_TO_EDGE; //Mapped to the same GL mode as clamp_to_edge
+		case rsx::texture_wrap_mode::mirror_once_clamp_to_edge: return GL_MIRROR_CLAMP_TO_EDGE_EXT;
+		case rsx::texture_wrap_mode::mirror_once_border: return GL_MIRROR_CLAMP_TO_BORDER_EXT;
+		case rsx::texture_wrap_mode::mirror_once_clamp: return GL_MIRROR_CLAMP_EXT;
+		}
+		//No case matched: log the raw value and fall back to GL_REPEAT
+		LOG_ERROR(RSX, "Texture wrap error: bad wrap (%d)", (u32)wrap);
+		return GL_REPEAT;
+	}
+
+	float max_aniso(rsx::texture_max_anisotropy aniso) //Translates an RSX max-anisotropy setting into its numeric factor
+	{
+		switch (aniso)
+		{
+		case rsx::texture_max_anisotropy::x1: return 1.0f;
+		case rsx::texture_max_anisotropy::x2: return 2.0f;
+		case rsx::texture_max_anisotropy::x4: return 4.0f;
+		case rsx::texture_max_anisotropy::x6: return 6.0f;
+		case rsx::texture_max_anisotropy::x8: return 8.0f;
+		case rsx::texture_max_anisotropy::x10: return 10.0f;
+		case rsx::texture_max_anisotropy::x12: return 12.0f;
+		case rsx::texture_max_anisotropy::x16: return 16.0f;
+		}
+		//No case matched: log the raw value and fall back to a factor of 1.0
+		LOG_ERROR(RSX, "Texture anisotropy error: bad max aniso (%d)", (u32)aniso);
+		return 1.0f;
+	}
+
+	int tex_min_filter(rsx::texture_minify_filter min_filter) //Translates an RSX minification filter into the GL filter enum
+	{
+		switch (min_filter)
+		{
+		case rsx::texture_minify_filter::nearest: return GL_NEAREST;
+		case rsx::texture_minify_filter::linear: return GL_LINEAR;
+		case rsx::texture_minify_filter::nearest_nearest: return GL_NEAREST_MIPMAP_NEAREST;
+		case rsx::texture_minify_filter::linear_nearest: return GL_LINEAR_MIPMAP_NEAREST;
+		case rsx::texture_minify_filter::nearest_linear: return GL_NEAREST_MIPMAP_LINEAR;
+		case rsx::texture_minify_filter::linear_linear: return GL_LINEAR_MIPMAP_LINEAR;
+		case rsx::texture_minify_filter::convolution_min: return GL_LINEAR_MIPMAP_LINEAR; //Mapped to the same GL filter as linear_linear
+		}
+		fmt::throw_exception("Unknow min filter" HERE); //Reached only when no case above matched
+	}
+
+	int tex_mag_filter(rsx::texture_magnify_filter mag_filter) //Translates an RSX magnification filter into the GL filter enum
+	{
+		switch (mag_filter)
+		{
+		case rsx::texture_magnify_filter::nearest: return GL_NEAREST;
+		case rsx::texture_magnify_filter::linear: return GL_LINEAR;
+		case rsx::texture_magnify_filter::convolution_mag: return GL_LINEAR; //Mapped to the same GL filter as linear
+		}
+		fmt::throw_exception("Unknow mag filter" HERE); //Reached only when no case above matched
+	}
+
+	//Uploads the wrap, border colour, filter, LOD and anisotropy settings of 'tex' to this sampler object
+	void sampler_state::apply(rsx::fragment_texture& tex)
+	{
+		const f32 border_color = (f32)tex.border_color() / 255; //NOTE(review): one scalar replicated to all four channels - confirm border_color() is not a packed colour
+		const f32 border_color_array[] = { border_color, border_color, border_color, border_color };
+		f32 lod_bias = 0.f; //Stays 0 when the texture has no mip chain
+
+		glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_S, wrap_mode(tex.wrap_s()));
+		glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_T, wrap_mode(tex.wrap_t()));
+		glSamplerParameteri(samplerHandle, GL_TEXTURE_WRAP_R, wrap_mode(tex.wrap_r()));
+		glSamplerParameterfv(samplerHandle, GL_TEXTURE_BORDER_COLOR, border_color_array);
+
+		if (tex.get_exact_mipmap_count() <= 1)
+		{
+			//At most one mip level: demote mipmapped min filters to their base filter and pin the LOD range to 0
+			GLint min_filter = tex_min_filter(tex.min_filter());
+			if (min_filter != GL_LINEAR && min_filter != GL_NEAREST)
+			{
+				switch (min_filter)
+				{
+				case GL_NEAREST_MIPMAP_NEAREST:
+				case GL_NEAREST_MIPMAP_LINEAR:
+					min_filter = GL_NEAREST; break;
+				case GL_LINEAR_MIPMAP_NEAREST:
+				case GL_LINEAR_MIPMAP_LINEAR:
+					min_filter = GL_LINEAR; break;
+				default:
+					LOG_ERROR(RSX, "No mipmap fallback defined for rsx_min_filter = 0x%X", (u32)tex.min_filter());
+					min_filter = GL_NEAREST;
+				}
+			}
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_FILTER, min_filter);
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_LOD, 0);
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_LOD, 0);
+		}
+		else
+		{
+			lod_bias = tex.bias();
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_FILTER, tex_min_filter(tex.min_filter()));
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MIN_LOD, (tex.min_lod() >> 8));
+			glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_LOD, (tex.max_lod() >> 8));
+		}
+		//The bias is fractional and the integer setter would truncate it, so it goes through the float-vector entry point
+		glSamplerParameterfv(samplerHandle, GL_TEXTURE_LOD_BIAS, &lod_bias);
+		glSamplerParameteri(samplerHandle, GL_TEXTURE_MAG_FILTER, tex_mag_filter(tex.mag_filter()));
+		glSamplerParameteri(samplerHandle, GL_TEXTURE_MAX_ANISOTROPY_EXT, ::gl::max_aniso(tex.max_aniso()));
+	}
 }
 
 namespace
@@ -182,32 +292,6 @@ namespace rsx
 {
 	namespace gl
 	{
-		int gl_tex_min_filter(rsx::texture_minify_filter min_filter)
-		{
-			switch (min_filter)
-			{
-			case rsx::texture_minify_filter::nearest: return GL_NEAREST;
-			case rsx::texture_minify_filter::linear: return GL_LINEAR;
-			case rsx::texture_minify_filter::nearest_nearest: return GL_NEAREST_MIPMAP_NEAREST;
-			case rsx::texture_minify_filter::linear_nearest: return GL_LINEAR_MIPMAP_NEAREST;
-			case rsx::texture_minify_filter::nearest_linear: return GL_NEAREST_MIPMAP_LINEAR;
-			case rsx::texture_minify_filter::linear_linear: return GL_LINEAR_MIPMAP_LINEAR;
-			case rsx::texture_minify_filter::convolution_min: return GL_LINEAR_MIPMAP_LINEAR;
-			}
-			fmt::throw_exception("Unknow min filter" HERE);
-		}
-
-		int gl_tex_mag_filter(rsx::texture_magnify_filter mag_filter)
-		{
-			switch (mag_filter)
-			{
-			case rsx::texture_magnify_filter::nearest: return GL_NEAREST;
-			case rsx::texture_magnify_filter::linear: return GL_LINEAR;
-			case rsx::texture_magnify_filter::convolution_mag: return GL_LINEAR;
-			}
-			fmt::throw_exception("Unknow mag filter" HERE);
-		}
-
 		static const int gl_tex_zfunc[] =
 		{
 			GL_NEVER,
@@ -230,42 +314,6 @@ namespace rsx
 			glGenTextures(1, &m_id);
 		}
 
-		int texture::gl_wrap(rsx::texture_wrap_mode wrap)
-		{
-			switch (wrap)
-			{
-			case rsx::texture_wrap_mode::wrap: return GL_REPEAT;
-			case rsx::texture_wrap_mode::mirror: return GL_MIRRORED_REPEAT;
-			case rsx::texture_wrap_mode::clamp_to_edge: return GL_CLAMP_TO_EDGE;
-			case rsx::texture_wrap_mode::border: return GL_CLAMP_TO_BORDER;
-			case rsx::texture_wrap_mode::clamp: return GL_CLAMP_TO_EDGE;
-			case rsx::texture_wrap_mode::mirror_once_clamp_to_edge: return GL_MIRROR_CLAMP_TO_EDGE_EXT;
-			case rsx::texture_wrap_mode::mirror_once_border: return GL_MIRROR_CLAMP_TO_BORDER_EXT;
-			case rsx::texture_wrap_mode::mirror_once_clamp: return GL_MIRROR_CLAMP_EXT;
-			}
-
-			LOG_ERROR(RSX, "Texture wrap error: bad wrap (%d)", (u32)wrap);
-			return GL_REPEAT;
-		}
-
-		float texture::max_aniso(rsx::texture_max_anisotropy aniso)
-		{
-			switch (aniso)
-			{
-			case rsx::texture_max_anisotropy::x1: return 1.0f;
-			case rsx::texture_max_anisotropy::x2: return 2.0f;
-			case rsx::texture_max_anisotropy::x4: return 4.0f;
-			case rsx::texture_max_anisotropy::x6: return 6.0f;
-			case rsx::texture_max_anisotropy::x8: return 8.0f;
-			case rsx::texture_max_anisotropy::x10: return 10.0f;
-			case rsx::texture_max_anisotropy::x12: return 12.0f;
-			case rsx::texture_max_anisotropy::x16: return 16.0f;
-			}
-
-			LOG_ERROR(RSX, "Texture anisotropy error: bad max aniso (%d)", (u32)aniso);
-			return 1.0f;
-		}
-
 		u16 texture::get_pitch_modifier(u32 format)
 		{
 			switch (format)
@@ -535,49 +583,7 @@ namespace rsx
 			__glcheck glTexParameteri(m_target, GL_TEXTURE_SWIZZLE_G, remap_values[2]);
 			__glcheck glTexParameteri(m_target, GL_TEXTURE_SWIZZLE_B, remap_values[3]);
 
-			__glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_S, gl_wrap(tex.wrap_s()));
-			__glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_T, gl_wrap(tex.wrap_t()));
-			__glcheck glTexParameteri(m_target, GL_TEXTURE_WRAP_R, gl_wrap(tex.wrap_r()));
-
-			if (tex.get_exact_mipmap_count() <= 1 || m_target == GL_TEXTURE_RECTANGLE)
-			{
-				GLint min_filter = gl_tex_min_filter(tex.min_filter());
-				
-				if (min_filter != GL_LINEAR && min_filter != GL_NEAREST)
-				{
-					LOG_WARNING(RSX, "Texture %d, target 0x%x, requesting mipmap filtering without any mipmaps set!", m_id, m_target);
-					
-					switch (min_filter)
-					{
-					case GL_NEAREST_MIPMAP_NEAREST:
-					case GL_NEAREST_MIPMAP_LINEAR:
-						min_filter = GL_NEAREST; break;
-					case GL_LINEAR_MIPMAP_NEAREST:
-					case GL_LINEAR_MIPMAP_LINEAR:
-						min_filter = GL_LINEAR; break;
-					default:
-						LOG_ERROR(RSX, "No mipmap fallback defined for rsx_min_filter = 0x%X", (u32)tex.min_filter());
-						min_filter = GL_NEAREST;
-					}
-				}
-
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_FILTER, min_filter);
-
-				__glcheck glTexParameterf(m_target, GL_TEXTURE_LOD_BIAS, 0.);
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_LOD, 0);
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MAX_LOD, 0);
-			}
-			else
-			{
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_FILTER, gl_tex_min_filter(tex.min_filter()));
-
-				__glcheck glTexParameterf(m_target, GL_TEXTURE_LOD_BIAS, tex.bias());
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MIN_LOD, (tex.min_lod() >> 8));
-				__glcheck glTexParameteri(m_target, GL_TEXTURE_MAX_LOD, (tex.max_lod() >> 8));
-			}
-
-			__glcheck glTexParameteri(m_target, GL_TEXTURE_MAG_FILTER, gl_tex_mag_filter(tex.mag_filter()));
-			__glcheck glTexParameterf(m_target, GL_TEXTURE_MAX_ANISOTROPY_EXT, max_aniso(tex.max_aniso()));
+			//The rest of sampler state is now handled by sampler state objects
 		}
 
 		void texture::init(int index, rsx::vertex_texture& tex)
diff --git a/rpcs3/Emu/RSX/GL/GLTexture.h b/rpcs3/Emu/RSX/GL/GLTexture.h
index e12271fa3f..464a9c05a6 100644
--- a/rpcs3/Emu/RSX/GL/GLTexture.h
+++ b/rpcs3/Emu/RSX/GL/GLTexture.h
@@ -5,7 +5,42 @@ namespace rsx
 {
 	class vertex_texture;
 	class fragment_texture;
+}
 
+namespace gl
+{
+	GLenum get_sized_internal_format(u32 gcm_format);
+	std::tuple<GLenum, GLenum> get_format_type(u32 texture_format);
+	GLenum wrap_mode(rsx::texture_wrap_mode wrap);
+	float max_aniso(rsx::texture_max_anisotropy aniso);
+
+	class sampler_state
+	{
+		GLuint samplerHandle = 0; //GL sampler object name; 0 while no sampler has been created
+	public:
+		//Generates the GL sampler object name
+		void create()
+		{
+			glGenSamplers(1, &samplerHandle);
+		}
+		//Deletes the sampler object and clears the handle so a stale name is never reused
+		void remove()
+		{
+			glDeleteSamplers(1, &samplerHandle);
+			samplerHandle = 0;
+		}
+		//Binds this sampler object to texture unit 'index'
+		void bind(int index)
+		{
+			glBindSampler(index, samplerHandle);
+		}
+		//Uploads the sampling parameters of 'tex' to this sampler object (defined in GLTexture.cpp)
+		void apply(rsx::fragment_texture& tex);
+	};
+}
+
+namespace rsx
+{
 	namespace gl
 	{
 		class texture
@@ -16,28 +51,6 @@ namespace rsx
 		public:
 			void create();
 
-			int gl_wrap(rsx::texture_wrap_mode in);
-
-			float max_aniso(rsx::texture_max_anisotropy aniso);
-
-			inline static u8 convert_4_to_8(u8 v)
-			{
-				// Swizzle bits: 00001234 -> 12341234
-				return (v << 4) | (v);
-			}
-
-			inline static u8 convert_5_to_8(u8 v)
-			{
-				// Swizzle bits: 00012345 -> 12345123
-				return (v << 3) | (v >> 2);
-			}
-
-			inline static u8 convert_6_to_8(u8 v)
-			{
-				// Swizzle bits: 00123456 -> 12345612
-				return (v << 2) | (v >> 4);
-			}
-
 			void init(int index, rsx::fragment_texture& tex);
 			void init(int index, rsx::vertex_texture& tex);
 			
@@ -64,9 +77,3 @@ namespace rsx
 		};
 	}
 }
-
-namespace gl
-{
-	GLenum get_sized_internal_format(u32 gcm_format);
-	std::tuple<GLenum, GLenum> get_format_type(u32 texture_format);
-}
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp
index 6e338ffccc..65c7ad6c7f 100644
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp
@@ -9,25 +9,25 @@ namespace gl
 {
 	bool texture_cache::flush_section(u32 address)
 	{
-		if (address < rtt_cache_range.first ||
-			address >= rtt_cache_range.second)
+		if (address < no_access_range.first ||
+			address >= no_access_range.second)
 			return false;
 
 		bool post_task = false;
-		cached_rtt_section* section_to_post = nullptr;
+		cached_texture_section* section_to_post = nullptr;
 
 		{
 			std::lock_guard<std::mutex> lock(m_section_mutex);
 
-			for (cached_rtt_section &rtt : m_rtt_cache)
+			for (cached_texture_section &tex : no_access_memory_sections)
 			{
-				if (rtt.is_dirty()) continue;
+				if (tex.is_dirty()) continue;
 
-				if (rtt.is_locked() && rtt.overlaps(address))
+				if (tex.is_locked() && tex.overlaps(address))
 				{
-					if (rtt.is_flushed())
+					if (tex.is_flushed())
 					{
-						LOG_WARNING(RSX, "Section matches range, but marked as already flushed!, 0x%X+0x%X", rtt.get_section_base(), rtt.get_section_size());
+						LOG_WARNING(RSX, "Section matches range, but marked as already flushed!, 0x%X+0x%X", tex.get_section_base(), tex.get_section_size());
 						continue;
 					}
 
@@ -36,11 +36,11 @@ namespace gl
 					if (std::this_thread::get_id() != m_renderer_thread)
 					{
 						post_task = true;
-						section_to_post = &rtt;
+						section_to_post = &tex;
 						break;
 					}
 
-					rtt.flush();
+					tex.flush();
 					return true;
 				}
 			}
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h
index e34c661d7e..3adde84f09 100644
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@@ -9,79 +9,31 @@
 #include <memory>
 #include <thread>
 #include <condition_variable>
+#include <chrono>
 
 #include "GLRenderTargets.h"
 #include "../Common/TextureUtils.h"
-#include <chrono>
+#include "../../Memory/vm.h"
+#include "Utilities/Config.h"
 
 class GLGSRender;
 
+extern cfg::bool_entry g_cfg_rsx_write_color_buffers;
+extern cfg::bool_entry g_cfg_rsx_write_depth_buffer;
+
 namespace gl
 {
 	class texture_cache
 	{
 	public:
-
 		class cached_texture_section : public rsx::buffered_section
-		{
-			u32 texture_id = 0;
-			u32 width = 0;
-			u32 height = 0;
-			u16 mipmaps = 0;
-
-		public:
-
-			void create(u32 id, u32 width, u32 height, u32 mipmaps)
-			{
-				verify(HERE), locked == false;
-
-				texture_id = id;
-				this->width = width;
-				this->height = height;
-				this->mipmaps = mipmaps;
-			}
-
-			bool matches(u32 rsx_address, u32 width, u32 height, u32 mipmaps) const
-			{
-				if (rsx_address == cpu_address_base && texture_id != 0)
-				{
-					if (!width && !height && !mipmaps)
-						return true;
-
-					return (width == this->width && height == this->height && mipmaps == this->mipmaps);
-				}
-
-				return false;
-			}
-
-			void destroy()
-			{
-				if (locked)
-					unprotect();
-
-				glDeleteTextures(1, &texture_id);
-				texture_id = 0;
-			}
-
-			bool is_empty() const
-			{
-				return (texture_id == 0);
-			}
-
-			u32 id() const
-			{
-				return texture_id;
-			}
-		};
-
-		class cached_rtt_section : public rsx::buffered_section
 		{
 		private:
 			fence m_fence;
 			u32 pbo_id = 0;
 			u32 pbo_size = 0;
 
-			u32 source_texture = 0;
+			u32 vram_texture = 0;
 
 			bool copied = false;
 			bool flushed = false;
@@ -161,6 +113,7 @@ namespace gl
 				return size;
 			}
 
+			//TODO: Move swscale routines to RSX shared
 			void scale_image_fallback(u8* dst, const u8* src, u16 src_width, u16 src_height, u16 dst_pitch, u16 src_pitch, u8 pixel_size, u8 samples)
 			{
 				u32 dst_offset = 0;
@@ -249,15 +202,47 @@ namespace gl
 
 		public:
 
-			void reset(u32 base, u32 size)
+			void reset(const u32 base, const u32 size, const bool flushable)
 			{
 				rsx::buffered_section::reset(base, size);
-				init_buffer();
+				
+				if (flushable)
+					init_buffer();
 				
 				flushed = false;
 				copied = false;
 
-				source_texture = 0;
+				vram_texture = 0;
+			}
+
+			void create_read_only(const u32 id, const u32 width, const u32 height) //Records the texture handle and dimensions of a read-only section
+			{
+				//Only to be used for ro memory, we dont care about most members, just dimensions and the vram texture handle
+				current_width = width;
+				current_height = height;
+				vram_texture = id;
+				//Both pitch values are zeroed here
+				current_pitch = 0;
+				real_pitch = 0;
+			}
+
+			bool matches(const u32 rsx_address, const u32 rsx_size) //Address/size comparison, delegated to the base class
+			{
+				return rsx::buffered_section::matches(rsx_address, rsx_size);
+			}
+
+			bool matches(const u32 rsx_address, const u32 width, const u32 height) //True when this non-dirty section starts at rsx_address and has the given dimensions
+			{
+				if (cpu_address_base == rsx_address && !dirty)
+				{
+					//Mostly only used to debug; matches textures without checking dimensions
+					if (width == 0 && height == 0)
+						return true;
+
+					return (current_width == width && current_height == height);
+				}
+				//Different base address, or the section is marked dirty
+				return false;
+			}
 
 			void set_dimensions(u32 width, u32 height, u32 pitch)
@@ -269,7 +254,7 @@ namespace gl
 				real_pitch = width * get_pixel_size(format, type);
 			}
 
-			void set_format(texture::format gl_format, texture::type gl_type, bool swap_bytes)
+			void set_format(const texture::format gl_format, const texture::type gl_type, const bool swap_bytes)
 			{
 				format = gl_format;
 				type = gl_type;
@@ -280,20 +265,20 @@ namespace gl
 
 			void set_source(gl::texture &source)
 			{
-				source_texture = source.id();
+				vram_texture = source.id();
 			}
 
 			void copy_texture()
 			{
-				if (!glIsTexture(source_texture))
+				if (!glIsTexture(vram_texture))
 				{
-					LOG_ERROR(RSX, "Attempted to download rtt texture, but texture handle was invalid! (0x%X)", source_texture);
+					LOG_ERROR(RSX, "Attempted to download rtt texture, but texture handle was invalid! (0x%X)", vram_texture);
 					return;
 				}
 
 				glPixelStorei(GL_PACK_SWAP_BYTES, pack_unpack_swap_bytes);
 				glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
-				glGetTextureImageEXT(source_texture, GL_TEXTURE_2D, 0, (GLenum)format, (GLenum)type, nullptr);
+				glGetTextureImageEXT(vram_texture, GL_TEXTURE_2D, 0, (GLenum)format, (GLenum)type, nullptr);
 				glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 
 				m_fence.reset();
@@ -393,9 +378,19 @@ namespace gl
 				if (locked)
 					unprotect();
 
-				glDeleteBuffers(1, &pbo_id);
-				pbo_id = 0;
-				pbo_size = 0;
+				if (pbo_id == 0)
+				{
+					//Read-only texture, destroy texture memory
+					glDeleteTextures(1, &vram_texture);
+					vram_texture = 0;
+				}
+				else
+				{
+					//Destroy pbo cache since vram texture is managed elsewhere
+					glDeleteBuffers(1, &pbo_id);
+					pbo_id = 0;
+					pbo_size = 0;
+				}
 
 				m_fence.destroy();
 			}
@@ -405,86 +400,229 @@ namespace gl
 				return flushed;
 			}
 
-			void set_flushed(bool state)
+			void set_flushed(const bool state)
 			{
 				flushed = state;
 			}
 
-			void set_copied(bool state)
+			void set_copied(const bool state)
 			{
 				copied = state;
 			}
+
+			bool is_empty() const
+			{
+				return vram_texture == 0;
+			}
+
+			u32 id() const //GL handle of the backing texture; 0 means none is attached (see is_empty)
+			{
+				return vram_texture;
+			}
+
+			std::tuple<u32, u32> get_dimensions() const //Returns (current_width, current_height) of this section
+			{
+				return std::make_tuple(current_width, current_height);
+			}
+		};
+
+		class blitter
+		{
+			fbo fbo_argb8;
+			fbo fbo_rgb565;
+			fbo blit_src;
+
+			u32 argb8_surface = 0;
+			u32 rgb565_surface = 0;
+
+		public:
+
+			void init()
+			{
+				fbo_argb8.create();
+				fbo_rgb565.create();
+				blit_src.create();
+
+				glGenTextures(1, &argb8_surface);
+				glBindTexture(GL_TEXTURE_2D, argb8_surface);
+				glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, 4096, 4096);
+
+				glGenTextures(1, &rgb565_surface);
+				glBindTexture(GL_TEXTURE_2D, rgb565_surface);
+				glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGB565, 4096, 4096);
+
+				s32 old_fbo = 0;
+				glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
+
+				fbo_argb8.bind();
+				glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, argb8_surface, 0);
+				
+				fbo_rgb565.bind();
+				glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, rgb565_surface, 0);
+
+				glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
+
+				fbo_argb8.check();
+				fbo_rgb565.check();
+			}
+
+			void destroy()
+			{
+				fbo_argb8.remove();
+				fbo_rgb565.remove();
+				blit_src.remove();
+
+				glDeleteTextures(1, &argb8_surface);
+				glDeleteTextures(1, &rgb565_surface);
+			}
+
+			u32 scale_image(u32 src, u32 dst, const areai src_rect, const areai dst_rect, const position2i dst_offset, const position2i clip_offset,
+					const size2i dst_dims, const size2i clip_dims, bool is_argb8, bool linear_interpolation)
+			{
+				s32 old_fbo = 0;
+				glGetIntegerv(GL_FRAMEBUFFER_BINDING, &old_fbo);
+				
+				blit_src.bind();
+				glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src, 0);
+				blit_src.check();
+
+				u32 src_surface = 0;
+				u32 dst_tex = dst;
+				filter interp = linear_interpolation ? filter::linear : filter::nearest;
+
+				if (!dst_tex)
+				{
+					glGenTextures(1, &dst_tex);
+					glBindTexture(GL_TEXTURE_2D, dst_tex);
+					glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+					glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+
+					if (is_argb8)
+						glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, dst_dims.width, dst_dims.height);
+					else
+						glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGB565, dst_dims.width, dst_dims.height);
+				}
+
+				GLboolean scissor_test_enabled = glIsEnabled(GL_SCISSOR_TEST);
+				if (scissor_test_enabled)
+					glDisable(GL_SCISSOR_TEST);
+
+				if (is_argb8)
+				{
+					blit_src.blit(fbo_argb8, src_rect, dst_rect, buffers::color, interp);
+					src_surface = argb8_surface;
+				}
+				else
+				{
+					blit_src.blit(fbo_rgb565, src_rect, dst_rect, buffers::color, interp);
+					src_surface = rgb565_surface;
+				}
+
+				glCopyImageSubData(src_surface, GL_TEXTURE_2D, 0, clip_offset.x, clip_offset.y, 0,
+					dst_tex, GL_TEXTURE_2D, 0, dst_offset.x, dst_offset.y, 0, clip_dims.width, clip_dims.height, 1);
+
+				if (scissor_test_enabled)
+					glEnable(GL_SCISSOR_TEST);
+
+				glBindFramebuffer(GL_FRAMEBUFFER, old_fbo);
+				return dst_tex;
+			}
 		};
 
 	private:
-		std::vector<cached_texture_section> m_texture_cache;
-		std::vector<cached_rtt_section> m_rtt_cache;
+		std::vector<cached_texture_section> read_only_memory_sections;
+		std::vector<cached_texture_section> no_access_memory_sections;
 		std::vector<u32> m_temporary_surfaces;
 
-		std::pair<u32, u32> texture_cache_range = std::make_pair(0xFFFFFFFF, 0);
-		std::pair<u32, u32> rtt_cache_range = std::make_pair(0xFFFFFFFF, 0);
+		std::pair<u32, u32> read_only_range = std::make_pair(0xFFFFFFFF, 0);
+		std::pair<u32, u32> no_access_range = std::make_pair(0xFFFFFFFF, 0);
+
+		blitter m_hw_blitter;
 
 		std::mutex m_section_mutex;
 
 		GLGSRender *m_renderer;
 		std::thread::id m_renderer_thread;
 
-		cached_texture_section *find_texture(u64 texaddr, u32 w, u32 h, u16 mipmaps)
+		cached_texture_section *find_texture_from_dimensions(u64 texaddr, u32 w, u32 h)
 		{
-			for (cached_texture_section &tex : m_texture_cache)
+			std::lock_guard<std::mutex> lock(m_section_mutex);
+
+			for (cached_texture_section &tex : read_only_memory_sections)
 			{
-				if (tex.matches(texaddr, w, h, mipmaps) && !tex.is_dirty())
+				if (tex.matches(texaddr, w, h) && !tex.is_dirty())
 					return &tex;
 			}
 
 			return nullptr;
 		}
 
-		cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h, u16 mipmap)
+		/**
+		 * Searches for a texture from read_only memory sections
+		 * Texture origin + size must be a subsection of the existing texture
+		 */
+		cached_texture_section *find_texture_from_range(u32 texaddr, u32 range)
 		{
-			for (cached_texture_section &tex : m_texture_cache)
+			std::lock_guard<std::mutex> lock(m_section_mutex);
+
+			auto test = std::make_pair(texaddr, range);
+			for (cached_texture_section &tex : read_only_memory_sections)
+			{
+				if (tex.get_section_base() > texaddr)
+					continue;
+
+				if (tex.overlaps(test, true) && !tex.is_dirty())
+					return &tex;
+			}
+
+			return nullptr;
+		}
+
+		cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h)
+		{
+			for (cached_texture_section &tex : read_only_memory_sections)
 			{
 				if (tex.is_dirty())
 				{
 					tex.destroy();
-					tex.reset(texaddr, texsize);
-					tex.create(id, w, h, mipmap);
+					tex.reset(texaddr, texsize, false);
+					tex.create_read_only(id, w, h);
 					
-					texture_cache_range = tex.get_min_max(texture_cache_range);
+					read_only_range = tex.get_min_max(read_only_range);
 					return tex;
 				}
 			}
 
 			cached_texture_section tex;
-			tex.reset(texaddr, texsize);
-			tex.create(id, w, h, mipmap);
-			texture_cache_range = tex.get_min_max(texture_cache_range);
+			tex.reset(texaddr, texsize, false);
+			tex.create_read_only(id, w, h);
+			read_only_range = tex.get_min_max(read_only_range);
 
-			m_texture_cache.push_back(tex);
-			return m_texture_cache.back();
+			read_only_memory_sections.push_back(tex);
+			return read_only_memory_sections.back();
 		}
 
 		void clear()
 		{
-			for (cached_texture_section &tex : m_texture_cache)
+			for (cached_texture_section &tex : read_only_memory_sections)
 			{
 				tex.destroy();
 			}
 
-			for (cached_rtt_section &rtt : m_rtt_cache)
+			for (cached_texture_section &tex : no_access_memory_sections)
 			{
-				rtt.destroy();
+				tex.destroy();
 			}
 
-			m_rtt_cache.resize(0);
-			m_texture_cache.resize(0);
+			read_only_memory_sections.resize(0);
+			no_access_memory_sections.resize(0);
 
 			clear_temporary_surfaces();
 		}
 
-		cached_rtt_section* find_cached_rtt_section(u32 base, u32 size)
+		cached_texture_section* find_cached_rtt_section(u32 base, u32 size)
 		{
-			for (cached_rtt_section &rtt : m_rtt_cache)
+			for (cached_texture_section &rtt : no_access_memory_sections)
 			{
 				if (rtt.matches(base, size))
 				{
@@ -495,17 +633,17 @@ namespace gl
 			return nullptr;
 		}
 
-		cached_rtt_section *create_locked_view_of_section(u32 base, u32 size)
+		cached_texture_section *create_locked_view_of_section(u32 base, u32 size)
 		{
-			cached_rtt_section *region = find_cached_rtt_section(base, size);
+			cached_texture_section *region = find_cached_rtt_section(base, size);
 
 			if (!region)
 			{
-				for (cached_rtt_section &rtt : m_rtt_cache)
+				for (cached_texture_section &rtt : no_access_memory_sections)
 				{
 					if (rtt.is_dirty())
 					{
-						rtt.reset(base, size);
+						rtt.reset(base, size, true);
 						rtt.protect(utils::protection::no);
 						region = &rtt;
 						break;
@@ -514,16 +652,16 @@ namespace gl
 
 				if (!region)
 				{
-					cached_rtt_section section;
-					section.reset(base, size);
+					cached_texture_section section;
+					section.reset(base, size, true);
 					section.set_dirty(true);
 					section.protect(utils::protection::no);
 
-					m_rtt_cache.push_back(section);
-					region = &m_rtt_cache.back();
+					no_access_memory_sections.push_back(section);
+					region = &no_access_memory_sections.back();
 				}
 
-				rtt_cache_range = region->get_min_max(rtt_cache_range);
+				no_access_range = region->get_min_max(no_access_range);
 			}
 			else
 			{
@@ -531,7 +669,7 @@ namespace gl
 				if (region->get_section_size() != size)
 				{
 					region->unprotect();
-					region->reset(base, size);
+					region->reset(base, size, true);
 				}
 
 				if (!region->is_locked() || region->is_flushed())
@@ -582,11 +720,15 @@ namespace gl
 		{
 			m_renderer = renderer;
 			m_renderer_thread = std::this_thread::get_id();
+
+			m_hw_blitter.init();
 		}
 
 		void close()
 		{
 			clear();
+
+			m_hw_blitter.destroy();
 		}
 
 		template<typename RsxTextureType>
@@ -594,6 +736,12 @@ namespace gl
 		{
 			const u32 texaddr = rsx::get_address(tex.offset(), tex.location());
 			const u32 range = (u32)get_texture_size(tex);
+			
+			const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN);
+			const u32 tex_width = tex.width();
+			const u32 tex_height = tex.height();
+			const u32 native_pitch = (tex_width * get_format_block_size_in_bytes(format));
+			const u32 tex_pitch = (tex.pitch() == 0)? native_pitch: tex.pitch();
 
 			if (!texaddr || !range)
 			{
@@ -627,15 +775,19 @@ namespace gl
 			 * a bound render target. We can bypass the expensive download in this case
 			 */
 
-			surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, tex.width(), tex.height(), tex.pitch());
+			const f32 internal_scale = (f32)tex_pitch / native_pitch;
+			const u32 internal_width = tex_width * internal_scale;
+
+			const surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true);
 			if (rsc.surface)
 			{
 				//Check that this region is not cpu-dirty before doing a copy
 				//This section is guaranteed to have a locking section *if* this bit has been bypassed before
 
+				//Is this really necessary?
 				bool upload_from_cpu = false;
 
-				for (cached_rtt_section &section : m_rtt_cache)
+				for (cached_texture_section &section : no_access_memory_sections)
 				{
 					if (section.overlaps(std::make_pair(texaddr, range)) && section.is_dirty())
 					{
@@ -655,21 +807,28 @@ namespace gl
 					}
 					else
 					{
-						const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN);
-
-						GLenum src_format = (GLenum)rsc.surface->get_internal_format();
-						GLenum dst_format = std::get<0>(get_format_type(format));
-
 						u32 bound_index = ~0U;
 
-						if (src_format != dst_format)
+						bool dst_is_compressed = (format == CELL_GCM_TEXTURE_COMPRESSED_DXT1 || format == CELL_GCM_TEXTURE_COMPRESSED_DXT23 || format == CELL_GCM_TEXTURE_COMPRESSED_DXT45);
+
+						if (!dst_is_compressed)
 						{
-							LOG_WARNING(RSX, "Sampling from a section of a render target, but formats might be incompatible (0x%X vs 0x%X)", src_format, dst_format);
+							GLenum src_format = (GLenum)rsc.surface->get_internal_format();
+							GLenum dst_format = std::get<0>(get_format_type(format));
+
+							if (src_format != dst_format)
+							{
+								LOG_WARNING(RSX, "Sampling from a section of a render target, but formats might be incompatible (0x%X vs 0x%X)", src_format, dst_format);
+							}
+						}
+						else
+						{
+							LOG_WARNING(RSX, "Surface blit from a compressed texture");
 						}
 
 						if (!rsc.is_bound)
 						{
-							if (rsc.w == tex.width() && rsc.h == tex.height())
+							if (rsc.w == tex_width && rsc.h == tex_height)
 								rsc.surface->bind();
 							else
 								bound_index = create_temporary_subresource(rsc.surface->id(), (GLenum)rsc.surface->get_compatible_internal_format(), rsc.x, rsc.y, rsc.w, rsc.h);
@@ -691,8 +850,7 @@ namespace gl
 			 * Search in cache and upload/bind
 			 */
 
-			cached_texture_section *cached_texture = find_texture(texaddr, tex.width(), tex.height(), tex.get_exact_mipmap_count());
-
+			cached_texture_section *cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height);
 			if (cached_texture)
 			{
 				verify(HERE), cached_texture->is_empty() == false;
@@ -705,17 +863,38 @@ namespace gl
 				return;
 			}
 
-			if (!tex.width() || !tex.height())
+			/**
+			 * Check for subslices from the cache in case we only have a subset of a larger texture
+			 */
+			cached_texture = find_texture_from_range(texaddr, range);
+			if (cached_texture)
 			{
-				LOG_ERROR(RSX, "Texture upload requested but invalid texture dimensions passed");
-				return;
+				const u32 address_offset = texaddr - cached_texture->get_section_base();
+				const u32 format = tex.format() & ~(CELL_GCM_TEXTURE_LN | CELL_GCM_TEXTURE_UN);
+				const GLenum ifmt = gl::get_sized_internal_format(format);
+
+				u16 offset_x = 0, offset_y = 0;
+
+				if (address_offset)
+				{
+					const u32 bpp = get_format_block_size_in_bytes(format);
+
+					//address_offset is in bytes: whole rows give the y offset directly, it must not be divided by bpp
+					offset_y = address_offset / tex_pitch;
+
+					//The remainder is a byte offset within the row; convert it to texels
+					offset_x = (address_offset % tex_pitch) / bpp;
+				}
+
+				u32 texture_id = create_temporary_subresource(cached_texture->id(), ifmt, offset_x, offset_y, tex_width, tex_height);
+				if (texture_id) return;
 			}
 
 			gl_texture.init(index, tex);
 
 			std::lock_guard<std::mutex> lock(m_section_mutex);
 
-			cached_texture_section &cached = create_texture(gl_texture.id(), texaddr, get_texture_size(tex), tex.width(), tex.height(), tex.get_exact_mipmap_count());
+			cached_texture_section &cached = create_texture(gl_texture.id(), texaddr, get_texture_size(tex), tex_width, tex_height);
 			cached.protect(utils::protection::ro);
 			cached.set_dirty(false);
 
@@ -727,7 +906,7 @@ namespace gl
 		{
 			std::lock_guard<std::mutex> lock(m_section_mutex);
 
-			cached_rtt_section *region = find_cached_rtt_section(base, size);
+			cached_texture_section *region = find_cached_rtt_section(base, size);
 
 			if (!region)
 			{
@@ -751,7 +930,7 @@ namespace gl
 		{
 			std::lock_guard<std::mutex> lock(m_section_mutex);
 
-			cached_rtt_section *region = create_locked_view_of_section(base, size);
+			cached_texture_section *region = create_locked_view_of_section(base, size);
 
 			if (!region->matches(base, size))
 			{
@@ -759,7 +938,7 @@ namespace gl
 				if (region->is_locked())
 					region->unprotect();
 
-				region->reset(base, size);
+				region->reset(base, size, true);
 				region->protect(utils::protection::no);
 			}
 
@@ -776,7 +955,7 @@ namespace gl
 		bool load_rtt(gl::texture &tex, const u32 address, const u32 pitch)
 		{
 			const u32 range = tex.height() * pitch;
-			cached_rtt_section *rtt = find_cached_rtt_section(address, range);
+			cached_texture_section *rtt = find_cached_rtt_section(address, range);
 
 			if (rtt && !rtt->is_dirty())
 			{
@@ -796,16 +975,45 @@ namespace gl
 			//TODO: Optimize this function!
 			//Multi-pass checking is slow. Pre-calculate dependency tree at section creation
 
-			if (address >= texture_cache_range.first &&
-				address < texture_cache_range.second)
+			if (address >= read_only_range.first &&
+				address < read_only_range.second)
 			{
 				std::lock_guard<std::mutex> lock(m_section_mutex);
 
-				for (int i = 0; i < m_texture_cache.size(); ++i)
+				for (int i = 0; i < read_only_memory_sections.size(); ++i)
 				{
-					auto &tex = m_texture_cache[i];
+					auto &tex = read_only_memory_sections[i];
 					if (!tex.is_locked()) continue;
 
+					auto overlapped = tex.overlaps_page(trampled_range, address);
+					if (std::get<0>(overlapped))
+					{
+						auto &new_range = std::get<1>(overlapped);
+
+						if (new_range.first != trampled_range.first ||
+							new_range.second != trampled_range.second)
+						{
+							trampled_range = new_range;
+							i = 0;
+						}
+
+						tex.unprotect();
+						tex.set_dirty(true);
+						response = true;
+					}
+				}
+			}
+
+			if (address >= no_access_range.first &&
+				address < no_access_range.second)
+			{
+				std::lock_guard<std::mutex> lock(m_section_mutex);
+
+				for (int i = 0; i < no_access_memory_sections.size(); ++i)
+				{
+					auto &tex = no_access_memory_sections[i];
+					if (tex.is_dirty() || !tex.is_locked()) continue;
+
 					auto overlapped = tex.overlaps_page(trampled_range, address);
 					if (std::get<0>(overlapped))
 					{
@@ -826,36 +1034,6 @@ namespace gl
 				}
 			}
 
-			if (address >= rtt_cache_range.first &&
-				address < rtt_cache_range.second)
-			{
-				std::lock_guard<std::mutex> lock(m_section_mutex);
-
-				for (int i = 0; i < m_rtt_cache.size(); ++i)
-				{
-					auto &rtt = m_rtt_cache[i];
-					if (rtt.is_dirty() || !rtt.is_locked()) continue;
-
-					auto overlapped = rtt.overlaps_page(trampled_range, address);
-					if (std::get<0>(overlapped))
-					{
-						auto &new_range = std::get<1>(overlapped);
-
-						if (new_range.first != trampled_range.first ||
-							new_range.second != trampled_range.second)
-						{
-							trampled_range = new_range;
-							i = 0;
-						}
-
-						rtt.unprotect();
-						rtt.set_dirty(true);
-
-						response = true;
-					}
-				}
-			}
-
 			return response;
 		}
 
@@ -864,25 +1042,25 @@ namespace gl
 			std::lock_guard<std::mutex> lock(m_section_mutex);
 			std::pair<u32, u32> range = std::make_pair(base, size);
 
-			if (base < texture_cache_range.second &&
-				(base + size) >= texture_cache_range.first)
+			if (base < read_only_range.second &&
+				(base + size) >= read_only_range.first)
 			{
-				for (cached_texture_section &tex : m_texture_cache)
+				for (cached_texture_section &tex : read_only_memory_sections)
 				{
 					if (!tex.is_dirty() && tex.overlaps(range))
 						tex.destroy();
 				}
 			}
 
-			if (base < rtt_cache_range.second &&
-				(base + size) >= rtt_cache_range.first)
+			if (base < no_access_range.second &&
+				(base + size) >= no_access_range.first)
 			{
-				for (cached_rtt_section &rtt : m_rtt_cache)
+				for (cached_texture_section &tex : no_access_memory_sections)
 				{
-					if (!rtt.is_dirty() && rtt.overlaps(range))
+					if (!tex.is_dirty() && tex.overlaps(range))
 					{
-						rtt.unprotect();
-						rtt.set_dirty(true);
+						tex.unprotect();
+						tex.set_dirty(true);
 					}
 				}
 			}
@@ -899,5 +1077,240 @@ namespace gl
 
 			m_temporary_surfaces.clear();
 		}
+
+		bool upload_scaled_image(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, gl_render_targets &m_rtts)
+		{
+			//Scales/copies the src surface into dst through m_hw_blitter (or a plain memcpy when possible); every path below returns true
+			//Since we will have dst in vram, we can 'safely' ignore the swizzle flag (TODO: Verify correct behavior)
+
+			bool src_is_render_target = false;
+			bool dst_is_render_target = false;
+			bool dst_is_argb8 = (dst.format == rsx::blit_engine::transfer_destination_format::a8r8g8b8);
+			bool src_is_argb8 = (src.format == rsx::blit_engine::transfer_source_format::a8r8g8b8);
+
+			GLenum src_gl_sized_format = src_is_argb8? GL_RGBA8: GL_RGB565;
+			GLenum src_gl_format = src_is_argb8 ? GL_BGRA : GL_RGB;
+			GLenum src_gl_type = src_is_argb8? GL_UNSIGNED_INT_8_8_8_8: GL_UNSIGNED_SHORT_5_6_5;
+
+			u32 vram_texture = 0;
+			u32 dest_texture = 0;
+
+			const u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0));
+			const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0));
+
+			//Check if src/dst are parts of render targets
+			surface_subresource dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true);
+			dst_is_render_target = dst_subres.surface != nullptr;
+
+			u16 max_dst_width = dst.width;
+			u16 max_dst_height = dst.height;
+
+			//Prepare areas and offsets
+			//Copy from [src.offset_x, src.offset_y] a region of [clip.width, clip.height]
+			//Stretch onto [dst.offset_x, y] with clipping performed on the source region
+			//The implementation here adds the inverse scaled clip dimensions onto the source to completely bypass final clipping step
+
+			float scale_x = (f32)dst.width / src.width;
+			float scale_y = (f32)dst.height / src.height;
+
+			//Clip offset is unused if the clip offsets are reprojected onto the source
+			position2i clip_offset = { 0, 0 };//{ dst.clip_x, dst.clip_y };
+			position2i dst_offset = { dst.offset_x, dst.offset_y };
+
+			size2i clip_dimensions = { dst.clip_width, dst.clip_height };
+			const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.height };
+
+			//Offset in x and y for src is 0 (it is already accounted for when getting pixels_src)
+			//Reproject final clip onto source...
+			const u16 src_w = clip_dimensions.width / scale_x;
+			const u16 src_h = clip_dimensions.height / scale_y;
+
+			areai src_area = { 0, 0, src_w, src_h };
+			areai dst_area = { 0, 0, dst.clip_width, dst.clip_height };
+
+			//If destination is neither a render target nor an existing texture in VRAM
+			//it's possible that this method is being used to perform a memcpy into RSX memory, so we check
+			//parameters. Whenever a simple memcpy can get the job done, use it instead.
+			//Dai-3-ji Super Robot Taisen for example uses this to copy program code to GPU RAM
+
+			bool is_memcpy = false;
+			u32 memcpy_bytes_length = 0;
+			if (dst_is_argb8 == src_is_argb8 && !dst.swizzled)
+			{
+				if ((src.slice_h == 1 && dst.clip_height == 1) ||
+					(dst.clip_width == src.width && dst.clip_height == src.slice_h && src.pitch == dst.pitch))
+				{
+					const u8 bpp = dst_is_argb8 ? 4 : 2;
+					is_memcpy = true;
+					memcpy_bytes_length = dst.clip_width * bpp * dst.clip_height;
+				}
+			}
+
+			if (!dst_is_render_target)
+			{
+				//First check if this surface exists in VRAM with exact dimensions
+				//Since scaled GPU resources are not invalidated by the CPU, we need to reuse older surfaces if possible
+				auto cached_dest = find_texture_from_dimensions(dst.rsx_address, dst_dimensions.width, dst_dimensions.height);
+
+				//Check for any available region that will fit this one
+				if (!cached_dest) cached_dest = find_texture_from_range(dst.rsx_address, dst.pitch * dst.clip_height);
+
+				if (cached_dest)
+				{
+					//TODO: Verify that the new surface will fit
+					dest_texture = cached_dest->id();
+
+					//TODO: Move this code into utils since it is used a lot
+					const u32 address_offset = dst.rsx_address - cached_dest->get_section_base();
+
+					const u16 bpp = dst_is_argb8 ? 4 : 2;
+					const u16 offset_y = address_offset / dst.pitch;
+					const u16 offset_x = address_offset % dst.pitch;
+
+					dst_offset.x += offset_x / bpp;
+					dst_offset.y += offset_y;
+
+					std::tie(max_dst_width, max_dst_height) = cached_dest->get_dimensions();
+				}
+				else if (is_memcpy)
+				{
+					memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
+					return true;
+				}
+			}
+			else
+			{
+				dst_offset.x = dst_subres.x;
+				dst_offset.y = dst_subres.y;
+
+				dest_texture = dst_subres.surface->id();
+
+				auto dims = dst_subres.surface->get_dimensions();
+				max_dst_width = dims.first;
+				max_dst_height = dims.second;
+
+				if (is_memcpy)
+				{
+					//Some render target descriptions are actually invalid
+					//Confirm this is a flushable RTT
+					const auto rsx_pitch = dst_subres.surface->get_rsx_pitch();
+					const auto native_pitch = dst_subres.surface->get_native_pitch();
+
+					if (rsx_pitch <= 64 && native_pitch != rsx_pitch)
+					{
+						memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
+						return true;
+					}
+				}
+			}
+
+			surface_subresource src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true);
+			src_is_render_target = src_subres.surface != nullptr;
+
+			//Create source texture if does not exist
+			if (!src_is_render_target)
+			{
+				auto preloaded_texture = find_texture_from_dimensions(src_address, src.width, src.slice_h);
+
+				if (preloaded_texture != nullptr)
+				{
+					vram_texture = preloaded_texture->id();
+				}
+				else
+				{
+					flush_section(src_address);
+
+					GLboolean swap_bytes = !src_is_argb8;
+					if (dst.swizzled)
+					{
+						//TODO: Check results against 565 textures
+						if (src_is_argb8)
+						{
+							src_gl_format = GL_RGBA;
+							swap_bytes = true;
+						}
+						else
+						{
+							LOG_ERROR(RSX, "RGB565 swizzled texture upload found");
+						}
+					}
+
+					glGenTextures(1, &vram_texture);
+					glBindTexture(GL_TEXTURE_2D, vram_texture);
+					glTexStorage2D(GL_TEXTURE_2D, 1, src_gl_sized_format, src.width, src.slice_h);
+					glPixelStorei(GL_UNPACK_ROW_LENGTH, src.pitch / (src_is_argb8 ? 4 : 2));
+					glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+					glPixelStorei(GL_UNPACK_SWAP_BYTES, swap_bytes);
+					glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+					glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+					glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, src.width, src.slice_h, src_gl_format, src_gl_type, src.pixels);
+
+					std::lock_guard<std::mutex> lock(m_section_mutex);
+
+					auto &section = create_texture(vram_texture, src_address, src.pitch * src.slice_h, src.width, src.slice_h);
+					section.protect(utils::protection::ro);
+					section.set_dirty(false);
+				}
+			}
+			else
+			{
+				if (src_subres.w != clip_dimensions.width ||
+					src_subres.h != clip_dimensions.height)
+				{
+					f32 subres_scaling_x = (f32)src.pitch / src_subres.surface->get_native_pitch();
+
+					dst_area.x2 = (src_subres.w * scale_x * subres_scaling_x);
+					dst_area.y2 = (src_subres.h * scale_y);
+				}
+
+				src_area.x2 = src_subres.w;
+				src_area.y2 = src_subres.h;
+
+				src_area.x1 += src_subres.x;
+				src_area.x2 += src_subres.x;
+				src_area.y1 += src_subres.y;
+				src_area.y2 += src_subres.y;
+
+				vram_texture = src_subres.surface->id();
+			}
+
+			//Validate clip offsets (Persona 4 Arena at 720p)
+			//Check if can fit
+			//NOTE: It is possible that the check is simpler (if (clip_x >= clip_width))
+			//Needs verification
+			if ((dst.offset_x + dst.clip_x + dst.clip_width) > max_dst_width) dst.clip_x = 0;
+			if ((dst.offset_y + dst.clip_y + dst.clip_height) > max_dst_height) dst.clip_y = 0;
+
+			if (dst.clip_x || dst.clip_y)
+			{
+				//Reproject clip offsets onto source
+				const u16 scaled_clip_offset_x = dst.clip_x / scale_x;
+				const u16 scaled_clip_offset_y = dst.clip_y / scale_y;
+
+				src_area.x1 += scaled_clip_offset_x;
+				src_area.x2 += scaled_clip_offset_x;
+				src_area.y1 += scaled_clip_offset_y;
+				src_area.y2 += scaled_clip_offset_y;
+			}
+
+			u32 texture_id = m_hw_blitter.scale_image(vram_texture, dest_texture, src_area, dst_area, dst_offset, clip_offset,
+					dst_dimensions, clip_dimensions, dst_is_argb8, interpolate);
+
+			if (dest_texture)
+				return true;
+
+			//TODO: Verify if any titles ever scale into CPU memory. It defeats the purpose of uploading data to the GPU, but it could happen
+			//If so, add this texture to the no_access queue not the read_only queue
+			std::lock_guard<std::mutex> lock(m_section_mutex);
+
+			cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, dst.width, dst.clip_height);
+			//These textures are completely GPU resident so we don't watch for CPU access
+			//There's no data to be fetched from the CPU
+			//It is possible for a title to attempt to read from the region, but the CPU path should be used in such cases
+			cached.protect(utils::protection::rw);
+			cached.set_dirty(false);
+
+			return true;
+		}
 	};
 }
\ No newline at end of file
diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h
index 8a31197b9b..7691fe152e 100644
--- a/rpcs3/Emu/RSX/rsx_cache.h
+++ b/rpcs3/Emu/RSX/rsx_cache.h
@@ -2,6 +2,7 @@
 #include <rsx_decompiler.h>
 #include "Utilities/VirtualMemory.h"
 #include "Emu/Memory/vm.h"
+#include "gcm_enums.h"
 
 namespace rsx
 {
@@ -37,14 +38,18 @@ namespace rsx
 		u16 offset_y;
 		u16 width;
 		u16 height;
-		u16 slice;
+		u16 slice_h;
 		u16 pitch;
 		void *pixels;
+
+		u32 rsx_address;
 	};
 
 	struct blit_dst_info
 	{
 		blit_engine::transfer_destination_format format;
+		u16 offset_x;
+		u16 offset_y;
 		u16 width;
 		u16 height;
 		u16 pitch;
@@ -52,8 +57,11 @@ namespace rsx
 		u16 clip_y;
 		u16 clip_width;
 		u16 clip_height;
+
 		bool swizzled;
 		void *pixels;
+
+		u32  rsx_address;
 	};
 
 	class shaders_cache
@@ -107,26 +115,9 @@ namespace rsx
 		bool locked = false;
 		bool dirty = false;
 
-		bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2)
+		inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2)
 		{
-			//Check for memory area overlap. unlock page(s) if needed and add this index to array.
-			//Axis separation test
-			const u32 &block_start = base1;
-			const u32 block_end = limit1;
-
-			if (limit2 < block_start) return false;
-			if (base2 > block_end) return false;
-
-			u32 min_separation = (limit2 - base2) + (limit1 - base1);
-			u32 range_limit = (block_end > limit2) ? block_end : limit2;
-			u32 range_base = (block_start < base2) ? block_start : base2;
-
-			u32 actual_separation = (range_limit - range_base);
-
-			if (actual_separation < min_separation)
-				return true;
-
-			return false;
+			return (base1 < limit2 && base2 < limit1);
 		}
 
 	public:
@@ -171,6 +162,19 @@ namespace rsx
 			return (locked_address_base <= address && (address - locked_address_base) < locked_address_range);
 		}
 
+		/**
+		 * Check if range overlaps with this section.
+		 * ignore_protection_range - if true, the test should not check against the aligned protection range, instead
+		 * tests against actual range of contents in memory
+		 */
+		bool overlaps(std::pair<u32, u32> range, bool ignore_protection_range)
+		{
+			if (!ignore_protection_range)
+				return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second);
+			else
+				return region_overlaps(cpu_address_base, cpu_address_base + cpu_address_range, range.first, range.first + range.second);
+		}
+
 		/**
 		 * Check if the page containing the address tramples this section. Also compares a former trampled page range to compare
 		 * If true, returns the range <min, max> with updated invalid range 
diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp
index 3600dfb4d5..cde5cfd4e5 100644
--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@@ -573,29 +573,41 @@ namespace rsx
 				}
 			}
 
-			blit_src_info src_info;
-			blit_dst_info dst_info;
+			if (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER)
+			{
+				//For now, only use this for actual scaled images, there are use cases that should not go through 3d engine, e.g program ucode transfer
+				//TODO: Figure out more instances where we can use this without problems
 
-			src_info.format = src_color_format;
-			src_info.width = in_w;
-			src_info.height = in_h;
-			src_info.pitch = in_pitch;
-			src_info.slice = slice_h;
-			src_info.pixels = pixels_src;
+				blit_src_info src_info;
+				blit_dst_info dst_info;
 
-			dst_info.format = dst_color_format;
-			dst_info.width = convert_w;
-			dst_info.height = convert_h;
-			dst_info.clip_x = clip_x;
-			dst_info.clip_y = clip_y;
-			dst_info.clip_width = clip_w;
-			dst_info.clip_height = clip_h;
-			dst_info.pitch = in_pitch;
-			dst_info.pixels = pixels_dst;
-			dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d);
+				src_info.format = src_color_format;
+				src_info.width = in_w;
+				src_info.height = in_h;
+				src_info.pitch = in_pitch;
+				src_info.slice_h = slice_h;
+				src_info.offset_x = in_x;
+				src_info.offset_y = in_y;
+				src_info.pixels = pixels_src;
+				src_info.rsx_address = get_address(src_offset, src_dma);
 
-			if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh))
-				return;
+				dst_info.format = dst_color_format;
+				dst_info.width = convert_w;
+				dst_info.height = convert_h;
+				dst_info.clip_x = clip_x;
+				dst_info.clip_y = clip_y;
+				dst_info.clip_width = clip_w;
+				dst_info.clip_height = clip_h;
+				dst_info.offset_x = out_x;
+				dst_info.offset_y = out_y;
+				dst_info.pitch = out_pitch;
+				dst_info.pixels = pixels_dst;
+				dst_info.rsx_address = get_address(dst_offset, dst_dma);
+				dst_info.swizzled = (method_registers.blit_engine_context_surface() == blit_engine::context_surface::swizzle2d);
+
+				if (rsx->scaled_image_from_memory(src_info, dst_info, in_inter == blit_engine::transfer_interpolator::foh))
+					return;
+			}
 
 			if (method_registers.blit_engine_context_surface() != blit_engine::context_surface::swizzle2d)
 			{
diff --git a/rpcs3/Emu/RSX/rsx_utils.cpp b/rpcs3/Emu/RSX/rsx_utils.cpp
index 2b231da3c8..bd1506076d 100644
--- a/rpcs3/Emu/RSX/rsx_utils.cpp
+++ b/rpcs3/Emu/RSX/rsx_utils.cpp
@@ -30,12 +30,15 @@ namespace rsx
 
 	void clip_image(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch)
 	{
+		//Copies a clip_w x clip_h pixel window starting at (clip_x, clip_y) of src into dst, one row per iteration
+		const u8 *pixels_src = src + clip_y * src_pitch + clip_x * bpp;
+		u8 *pixels_dst = dst;
+		const u32 row_length = clip_w * bpp;
 		for (int y = 0; y < clip_h; ++y)
 		{
-			u8 *dst_row = dst + y * dst_pitch;
-			const u8 *src_row = src + (y + clip_y) * src_pitch + clip_x * bpp;
-
-			std::memmove(dst_row, src_row, clip_w * bpp);
+			std::memmove(pixels_dst, pixels_src, row_length);
+			pixels_src += src_pitch;
+			pixels_dst += dst_pitch;
 		}
 	}