vulkan API fixes

- Fix for texture barriers
- vulkan: Rework texture cache handling of depth surfaces
- Support for scaled depth blit using overlay pass
- Support proper readback of D24S8 in both D32F_S8 and D24U_S8 variants
- Optimize the depth conversion routines with SSE
- vulkan: Replace slow single element copy with std::memcpy
- Check heap status before attempting blit operations
- Bump guard size on upload buffer as well
kd-11 2018-02-03 16:42:02 +03:00
parent 3bbecd998a
commit c191a98ec3
10 changed files with 346 additions and 71 deletions

View File

@@ -109,7 +109,7 @@ public:
bool is_critical()
{
const size_t guard_length = std::max(m_min_guard_size, m_largest_allocated_pool);
return (m_current_allocated_size + guard_length) > m_size;
return (m_current_allocated_size + guard_length) >= m_size;
}
void reset_allocation_stats()
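The guard comparison now trips one allocation earlier: with >= the heap is flagged critical as soon as the remaining space only just covers the guard, instead of only after it has been consumed. A minimal standalone sketch of the guard semantics follows; it is a hypothetical model that mirrors the member names above, not the project's actual class.

#include <algorithm>
#include <cstddef>

// Hypothetical standalone model of the ring-heap guard check shown above.
struct ring_heap_model
{
    size_t m_size = 0;                    // total heap capacity in bytes
    size_t m_current_allocated_size = 0;  // bytes currently in flight
    size_t m_min_guard_size = 0;          // configured minimum guard
    size_t m_largest_allocated_pool = 0;  // largest single allocation observed

    bool is_critical() const
    {
        // Reserve room for at least one worst-case allocation; using >= flags
        // the heap as critical even when the guard would fit exactly, so the
        // renderer can flush before the next allocation overflows the ring.
        const size_t guard_length = std::max(m_min_guard_size, m_largest_allocated_pool);
        return (m_current_allocated_size + guard_length) >= m_size;
    }
};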

View File

@@ -35,10 +35,8 @@ namespace gl
switch (type)
{
case GL_DEBUG_TYPE_ERROR:
{
LOG_ERROR(RSX, "%s", message);
return;
}
default:
LOG_WARNING(RSX, "%s", message);
return;

View File

@@ -603,7 +603,7 @@ VKGSRender::VKGSRender() : GSRender()
m_uniform_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, 0));
m_index_buffer_ring_info.init(VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_index_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, 0));
m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 0x400000);
m_texture_upload_buffer_ring_info.init(VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
m_texture_upload_buffer_ring_info.heap.reset(new vk::buffer(*m_device, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0));
for (auto &ctx : frame_context_storage)
@@ -627,6 +627,9 @@ VKGSRender::VKGSRender() : GSRender()
m_depth_converter.reset(new vk::depth_convert_pass());
m_depth_converter->create(*m_device);
m_depth_scaler.reset(new vk::depth_scaling_pass());
m_depth_scaler->create(*m_device);
m_prog_buffer.reset(new VKProgramBuffer(m_render_passes.data()));
if (g_cfg.video.disable_vertex_cache)
@@ -750,6 +753,10 @@ VKGSRender::~VKGSRender()
m_depth_converter->destroy();
m_depth_converter.reset();
//Depth surface blitter
m_depth_scaler->destroy();
m_depth_scaler.reset();
//Pipeline descriptors
vkDestroyPipelineLayout(*m_device, pipeline_layout, nullptr);
vkDestroyDescriptorSetLayout(*m_device, descriptor_layouts, nullptr);
@@ -884,29 +891,8 @@ void VKGSRender::notify_tile_unbound(u32 tile)
}
}
void VKGSRender::begin()
void VKGSRender::check_heap_status()
{
rsx::thread::begin();
if (skip_frame || renderer_unavailable ||
(conditional_render_enabled && conditional_render_test_failed))
return;
init_buffers(rsx::framebuffer_creation_context::context_draw);
if (!framebuffer_status_valid)
return;
//Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources
if (m_current_frame->used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS)
{
//No need to stall if we have more than one frame queue anyway
flush_command_queue();
CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0));
m_current_frame->used_descriptors = 0;
}
if (m_attrib_ring_info.is_critical() ||
m_texture_upload_buffer_ring_info.is_critical() ||
m_uniform_buffer_ring_info.is_critical() ||
@@ -953,6 +939,32 @@ void VKGSRender::begin()
std::chrono::time_point<steady_clock> submit_end = steady_clock::now();
m_flip_time += std::chrono::duration_cast<std::chrono::microseconds>(submit_end - submit_start).count();
}
}
void VKGSRender::begin()
{
rsx::thread::begin();
if (skip_frame || renderer_unavailable ||
(conditional_render_enabled && conditional_render_test_failed))
return;
init_buffers(rsx::framebuffer_creation_context::context_draw);
if (!framebuffer_status_valid)
return;
//Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources
if (m_current_frame->used_descriptors >= DESCRIPTOR_MAX_DRAW_CALLS)
{
//No need to stall if we have more than one frame queue anyway
flush_command_queue();
CHECK_RESULT(vkResetDescriptorPool(*m_device, m_current_frame->descriptor_pool, 0));
m_current_frame->used_descriptors = 0;
}
check_heap_status();
VkDescriptorSetAllocateInfo alloc_info = {};
alloc_info.descriptorPool = m_current_frame->descriptor_pool;
@@ -1994,6 +2006,7 @@ void VKGSRender::process_swap_request(frame_context_t *ctx, bool free_resources)
}
m_depth_converter->free_resources();
m_depth_scaler->free_resources();
m_ui_renderer->free_resources();
ctx->buffer_views_to_clean.clear();
@@ -2736,7 +2749,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
const u32 range = pitch * m_depth_surface_info.height * aa_factor;
m_texture_cache.lock_memory_region(std::get<1>(m_rtts.m_bound_depth_stencil), m_depth_surface_info.address, range,
m_depth_surface_info.width, m_depth_surface_info.height, m_depth_surface_info.pitch, gcm_format, true);
m_depth_surface_info.width, m_depth_surface_info.height, m_depth_surface_info.pitch, gcm_format, false);
}
}
@@ -3165,11 +3178,39 @@ bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst
if (renderer_unavailable)
return false;
//Verify enough memory exists before attempting to handle data transfer
check_heap_status();
//Stop all parallel operations until this is finished
std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
auto result = m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer);
m_current_command_buffer->begin();
if (auto deferred_op_dst = std::get<1>(result))
{
//Requires manual scaling; depth/stencil surface
auto deferred_op_src = std::get<2>(result);
auto src_view = std::get<3>(result);
auto rp = vk::get_render_pass_location(VK_FORMAT_UNDEFINED, deferred_op_dst->info.format, 0);
auto render_pass = m_render_passes[rp];
auto old_src_layout = deferred_op_src->current_layout;
auto old_dst_layout = deferred_op_dst->current_layout;
vk::change_image_layout(*m_current_command_buffer, deferred_op_src, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
vk::change_image_layout(*m_current_command_buffer, deferred_op_dst, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL);
m_depth_scaler->run(*m_current_command_buffer, deferred_op_dst->width(), deferred_op_dst->height(), deferred_op_dst,
src_view, render_pass, m_framebuffers_to_clean);
vk::change_image_layout(*m_current_command_buffer, deferred_op_src, old_src_layout);
vk::change_image_layout(*m_current_command_buffer, deferred_op_dst, old_dst_layout);
}
m_samplers_dirty.store(true);
return result;
return std::get<0>(result);
}
void VKGSRender::clear_zcull_stats(u32 type)

View File

@@ -252,6 +252,7 @@ private:
std::unique_ptr<vk::text_writer> m_text_writer;
std::unique_ptr<vk::depth_convert_pass> m_depth_converter;
std::unique_ptr<vk::depth_scaling_pass> m_depth_scaler;
std::unique_ptr<vk::ui_overlay_renderer> m_ui_renderer;
std::mutex m_sampler_mutex;
@@ -376,6 +377,8 @@ private:
void update_draw_state();
void check_heap_status();
/// returns primitive topology, index_count, allocated_verts, vertex_base_index, (offset in index buffer, index type)
std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
public:

View File

@@ -388,6 +388,26 @@ namespace vk
image->current_layout = new_layout;
}
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout)
{
if (image->current_layout == new_layout) return;
VkImageAspectFlags flags = VK_IMAGE_ASPECT_COLOR_BIT;
switch (image->info.format)
{
case VK_FORMAT_D16_UNORM:
flags = VK_IMAGE_ASPECT_DEPTH_BIT;
break;
case VK_FORMAT_D24_UNORM_S8_UINT:
case VK_FORMAT_D32_SFLOAT_S8_UINT:
flags = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
break;
}
change_image_layout(cmd, image->value, image->current_layout, new_layout, { flags, 0, 1, 0, 1 });
image->current_layout = new_layout;
}
void insert_texture_barrier(VkCommandBuffer cmd, VkImage image, VkImageLayout layout, VkImageSubresourceRange range)
{
VkImageMemoryBarrier barrier = {};
@@ -419,7 +439,9 @@ namespace vk
{
if (image->info.usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)
{
insert_texture_barrier(cmd, image->value, image->current_layout, { VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1 });
VkImageAspectFlags aspect = VK_IMAGE_ASPECT_DEPTH_BIT;
if (image->info.format != VK_FORMAT_D16_UNORM) aspect |= VK_IMAGE_ASPECT_STENCIL_BIT;
insert_texture_barrier(cmd, image->value, image->current_layout, { aspect, 0, 1, 0, 1 });
}
else
{

View File

@@ -82,6 +82,7 @@ namespace vk
void change_image_layout(VkCommandBuffer cmd, VkImage image, VkImageLayout current_layout, VkImageLayout new_layout, VkImageSubresourceRange range);
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout, VkImageSubresourceRange range);
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout);
void copy_image(VkCommandBuffer cmd, VkImage &src, VkImage &dst, VkImageLayout srcLayout, VkImageLayout dstLayout, u32 width, u32 height, u32 mipmaps, VkImageAspectFlagBits aspect);
void copy_scaled_image(VkCommandBuffer cmd, VkImage &src, VkImage &dst, VkImageLayout srcLayout, VkImageLayout dstLayout, u32 src_x_offset, u32 src_y_offset, u32 src_width, u32 src_height, u32 dst_x_offset, u32 dst_y_offset, u32 dst_width, u32 dst_height, u32 mipmaps, VkImageAspectFlagBits aspect, bool compatible_formats);

View File

@@ -716,4 +716,42 @@ namespace vk
ui.update();
}
};
struct depth_scaling_pass : public overlay_pass
{
depth_scaling_pass()
{
vs_src =
{
"#version 450\n"
"#extension GL_ARB_separate_shader_objects : enable\n"
"layout(location=0) out vec2 tc0;\n"
"\n"
"void main()\n"
"{\n"
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
" vec2 coords[] = {vec2(0., 0.), vec2(1., 0.), vec2(0., 1.), vec2(1., 1.)};\n"
" gl_Position = vec4(positions[gl_VertexIndex % 4], 0., 1.);\n"
" tc0 = coords[gl_VertexIndex % 4];\n"
"}\n"
};
fs_src =
{
"#version 420\n"
"#extension GL_ARB_separate_shader_objects : enable\n"
"layout(set=0, binding=0) uniform sampler2D fs0;\n"
"layout(location=0) in vec2 tc0;\n"
"\n"
"void main()\n"
"{\n"
" gl_FragDepth = texture(fs0, tc0).x;\n"
"}\n"
};
renderpass_config.write_color = false;
m_vertex_shader.id = 100006;
m_fragment_shader.id = 100007;
}
};
}
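The fragment shader copies the sampled depth into gl_FragDepth, so the pass only works if the pipeline binds the destination as a depth attachment with depth writes enabled and an always-pass compare. The overlay_pass plumbing is not part of this hunk; the following is a plain-Vulkan sketch of the depth/stencil state such a pass would plausibly need (an assumption for illustration, not code from this commit).

#include <vulkan/vulkan.h>

// Assumed depth/stencil pipeline state for a fullscreen pass that writes
// gl_FragDepth unconditionally; stencil contents are not transferred.
VkPipelineDepthStencilStateCreateInfo make_depth_write_state()
{
    VkPipelineDepthStencilStateCreateInfo ds = {};
    ds.sType             = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
    ds.depthTestEnable   = VK_TRUE;              // depth writes only happen when the test stage is enabled
    ds.depthWriteEnable  = VK_TRUE;              // store gl_FragDepth into the attachment
    ds.depthCompareOp    = VK_COMPARE_OP_ALWAYS; // every fragment passes
    ds.stencilTestEnable = VK_FALSE;
    return ds;
}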

View File

@@ -183,15 +183,16 @@ namespace vk
break;
}
//TODO: Read back stencil values (is this really necessary?)
VkBufferImageCopy copyRegion = {};
copyRegion.bufferOffset = 0;
copyRegion.bufferRowLength = internal_width;
copyRegion.bufferImageHeight = internal_height;
copyRegion.imageSubresource = {aspect_flag, 0, 0, 1};
copyRegion.imageSubresource = {aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 0, 1};
copyRegion.imageOffset = {};
copyRegion.imageExtent = {internal_width, internal_height, 1};
VkImageSubresourceRange subresource_range = { aspect_flag & ~(VK_IMAGE_ASPECT_STENCIL_BIT), 0, 1, 0, 1 };
VkImageSubresourceRange subresource_range = { aspect_flag, 0, 1, 0, 1 };
VkImageLayout layout = vram_texture->current_layout;
change_image_layout(cmd, vram_texture, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, subresource_range);
@@ -235,11 +236,7 @@ namespace vk
}
else
{
auto typed_dst = (T *)pixels_dst;
auto typed_src = (T *)pixels_src;
for (u32 px = 0; px < block_size; ++px)
typed_dst[px] = typed_src[px];
memcpy(pixels_dst, pixels_src, block_size * sizeof(T));
}
}
}
@@ -273,38 +270,55 @@ namespace vk
//We have to do our own byte swapping since the driver doesn't do it for us
if (real_pitch == rsx_pitch)
{
switch (bpp)
bool is_depth_format = true;
switch (vram_texture->info.format)
{
case VK_FORMAT_D32_SFLOAT_S8_UINT:
rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_src, cpu_address_range >> 2, 1);
break;
case VK_FORMAT_D24_UNORM_S8_UINT:
rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_src, cpu_address_range >> 2, 1);
break;
default:
LOG_ERROR(RSX, "Invalid bpp %d", bpp);
case 1:
do_memory_transfer<u8, false>(pixels_dst, pixels_src);
break;
case 2:
if (pack_unpack_swap_bytes)
do_memory_transfer<u16, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u16, false>(pixels_dst, pixels_src);
break;
case 4:
if (pack_unpack_swap_bytes)
do_memory_transfer<u32, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u32, false>(pixels_dst, pixels_src);
break;
case 8:
if (pack_unpack_swap_bytes)
do_memory_transfer<u64, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u64, false>(pixels_dst, pixels_src);
break;
case 16:
if (pack_unpack_swap_bytes)
do_memory_transfer<u128, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u128, false>(pixels_dst, pixels_src);
is_depth_format = false;
break;
}
if (!is_depth_format)
{
switch (bpp)
{
default:
LOG_ERROR(RSX, "Invalid bpp %d", bpp);
case 1:
do_memory_transfer<u8, false>(pixels_dst, pixels_src);
break;
case 2:
if (pack_unpack_swap_bytes)
do_memory_transfer<u16, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u16, false>(pixels_dst, pixels_src);
break;
case 4:
if (pack_unpack_swap_bytes)
do_memory_transfer<u32, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u32, false>(pixels_dst, pixels_src);
break;
case 8:
if (pack_unpack_swap_bytes)
do_memory_transfer<u64, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u64, false>(pixels_dst, pixels_src);
break;
case 16:
if (pack_unpack_swap_bytes)
do_memory_transfer<u128, true>(pixels_dst, pixels_src);
else
do_memory_transfer<u128, false>(pixels_dst, pixels_src);
break;
}
}
}
else
{
@@ -324,6 +338,16 @@ namespace vk
}
rsx::scale_image_nearest(pixels_dst, pixels_src, width, height, rsx_pitch, real_pitch, bpp, samples_u, samples_v, pack_unpack_swap_bytes);
switch (vram_texture->info.format)
{
case VK_FORMAT_D32_SFLOAT_S8_UINT:
rsx::convert_le_f32_to_be_d24(pixels_dst, pixels_dst, cpu_address_range >> 2, 1);
break;
case VK_FORMAT_D24_UNORM_S8_UINT:
rsx::convert_le_d24x8_to_be_d24x8(pixels_dst, pixels_dst, cpu_address_range >> 2, 1);
break;
}
}
dma_buffer->unmap();
@@ -690,6 +714,7 @@ namespace vk
VkImageAspectFlags aspect_flags;
VkImageType image_type;
VkImageViewType image_view_type;
VkImageUsageFlags usage_flags = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
u8 layer = 0;
switch (type)
@@ -724,10 +749,12 @@ namespace vk
{
case CELL_GCM_TEXTURE_DEPTH24_D8:
aspect_flags = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
usage_flags |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
vk_format = m_formats_support.d24_unorm_s8? VK_FORMAT_D24_UNORM_S8_UINT : VK_FORMAT_D32_SFLOAT_S8_UINT;
break;
case CELL_GCM_TEXTURE_DEPTH16:
aspect_flags = VK_IMAGE_ASPECT_DEPTH_BIT;
usage_flags |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
vk_format = VK_FORMAT_D16_UNORM;
break;
default:
@@ -740,8 +767,7 @@ namespace vk
image_type,
vk_format,
width, height, depth, mipmaps, layer, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
is_cubemap ? VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0);
VK_IMAGE_TILING_OPTIMAL, usage_flags, is_cubemap ? VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT : 0);
mapping = apply_component_mapping_flags(gcm_format, flags, remap_vector);
@@ -768,7 +794,7 @@ namespace vk
{
//TODO: Confirm byte swap patterns
region.protect(utils::protection::no);
region.set_unpack_swap_bytes(true);
region.set_unpack_swap_bytes((aspect_flags & VK_IMAGE_ASPECT_COLOR_BIT) == VK_IMAGE_ASPECT_COLOR_BIT);
no_access_range = region.get_min_max(no_access_range);
}
@@ -954,12 +980,16 @@ namespace vk
return upload_texture(cmd, tex, m_rtts, cmd, m_memory_types, const_cast<const VkQueue>(m_submit_queue));
}
bool blit(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, rsx::vk_render_targets& m_rtts, vk::command_buffer& cmd)
std::tuple<bool, vk::image*, vk::image*, vk::image_view*> blit(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, rsx::vk_render_targets& m_rtts, vk::command_buffer& cmd)
{
struct blit_helper
{
vk::command_buffer* commands;
blit_helper(vk::command_buffer *c) : commands(c) {}
vk::image* deferred_op_src = nullptr;
vk::image* deferred_op_dst = nullptr;
void scale_image(vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool /*interpolate*/, bool is_depth)
{
VkImageAspectFlagBits aspect = VK_IMAGE_ASPECT_COLOR_BIT;
@@ -984,15 +1014,44 @@ namespace vk
return;
}
copy_scaled_image(*commands, src->value, dst->value, src->current_layout, dst->current_layout, src_area.x1, src_area.y1, src_area.x2 - src_area.x1, src_area.y2 - src_area.y1,
dst_area.x1, dst_area.y1, dst_area.x2 - dst_area.x1, dst_area.y2 - dst_area.y1, 1, aspect, src->info.format == dst->info.format);
const auto src_width = src_area.x2 - src_area.x1;
const auto src_height = src_area.y2 - src_area.y1;
const auto dst_width = dst_area.x2 - dst_area.x1;
const auto dst_height = dst_area.y2 - dst_area.y1;
if (aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
{
if (src_width != dst_width || src_height != dst_height || src->info.format != dst->info.format)
{
//Scaled or format-converted depth blit; defer to the shader-based depth scaling pass
deferred_op_src = src;
deferred_op_dst = dst;
}
}
if (!deferred_op_src)
{
copy_scaled_image(*commands, src->value, dst->value, src->current_layout, dst->current_layout, src_area.x1, src_area.y1, src_width, src_height,
dst_area.x1, dst_area.y1, dst_width, dst_height, 1, aspect, src->info.format == dst->info.format);
}
change_image_layout(*commands, dst, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, {(VkImageAspectFlags)aspect, 0, dst->info.mipLevels, 0, dst->info.arrayLayers});
}
}
helper(&cmd);
return upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper, cmd, m_memory_types, const_cast<const VkQueue>(m_submit_queue));
bool reply = upload_scaled_image(src, dst, interpolate, cmd, m_rtts, helper, cmd, m_memory_types, const_cast<const VkQueue>(m_submit_queue));
if (helper.deferred_op_src == nullptr)
return std::make_tuple(reply, nullptr, nullptr, nullptr);
VkImageSubresourceRange view_range = { VK_IMAGE_ASPECT_DEPTH_BIT, 0, 1, 0, 1 };
auto tmp_view = std::make_unique<vk::image_view>(*vk::get_current_renderer(), helper.deferred_op_src->value, VK_IMAGE_VIEW_TYPE_2D,
helper.deferred_op_src->info.format, helper.deferred_op_src->native_component_map, view_range);
auto src_view = tmp_view.get();
m_discardable_storage.push_back(tmp_view);
return std::make_tuple(reply, helper.deferred_op_dst, helper.deferred_op_src, src_view);
}
const u32 get_unreleased_textures_count() const override

View File

@@ -4,6 +4,7 @@
#include "Emu/RSX/GCM.h"
#include "Common/BufferUtils.h"
#include "overlays.h"
#include "Utilities/sysinfo.h"
extern "C"
{
@@ -363,4 +364,113 @@ namespace rsx
}
}
}
void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
{
const u32 num_pixels = row_length_in_texels * num_rows;
verify(HERE), (num_pixels & 3) == 0;
const auto num_iterations = (num_pixels >> 2);
__m128i* dst_ptr = (__m128i*)dst;
__m128i* src_ptr = (__m128i*)src;
const __m128 scale_vector = _mm_set1_ps(16777214.f);
#if defined (_MSC_VER) || defined (__SSSE3__)
if (LIKELY(utils::has_ssse3()))
{
const __m128i swap_mask = _mm_set_epi8
(
0xF, 0xC, 0xD, 0xE,
0xB, 0x8, 0x9, 0xA,
0x7, 0x4, 0x5, 0x6,
0x3, 0x0, 0x1, 0x2
);
for (u32 n = 0; n < num_iterations; ++n)
{
const __m128i src_vector = _mm_loadu_si128(src_ptr);
const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));
const __m128i shuffled_vector = _mm_shuffle_epi8(result, swap_mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
++dst_ptr;
++src_ptr;
}
return;
}
#endif
const __m128i mask1 = _mm_set1_epi32(0xFF00FF00);
const __m128i mask2 = _mm_set1_epi32(0x00FF0000);
const __m128i mask3 = _mm_set1_epi32(0x000000FF);
for (u32 n = 0; n < num_iterations; ++n)
{
const __m128i src_vector = _mm_loadu_si128(src_ptr);
const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));
const __m128i v1 = _mm_and_si128(result, mask1);
const __m128i v2 = _mm_and_si128(_mm_slli_epi32(result, 16), mask2);
const __m128i v3 = _mm_and_si128(_mm_srli_epi32(result, 16), mask3);
const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3);
_mm_stream_si128(dst_ptr, shuffled_vector);
++dst_ptr;
++src_ptr;
}
}
void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
{
const u32 num_pixels = row_length_in_texels * num_rows;
verify(HERE), (num_pixels & 3) == 0;
const auto num_iterations = (num_pixels >> 2);
__m128i* dst_ptr = (__m128i*)dst;
__m128i* src_ptr = (__m128i*)src;
#if defined (_MSC_VER) || defined (__SSSE3__)
if (LIKELY(utils::has_ssse3()))
{
const __m128i swap_mask = _mm_set_epi8
(
0xF, 0xC, 0xD, 0xE,
0xB, 0x8, 0x9, 0xA,
0x7, 0x4, 0x5, 0x6,
0x3, 0x0, 0x1, 0x2
);
for (u32 n = 0; n < num_iterations; ++n)
{
const __m128i src_vector = _mm_loadu_si128(src_ptr);
const __m128i shuffled_vector = _mm_shuffle_epi8(src_vector, swap_mask);
_mm_stream_si128(dst_ptr, shuffled_vector);
++dst_ptr;
++src_ptr;
}
return;
}
#endif
const __m128i mask1 = _mm_set1_epi32(0xFF00FF00);
const __m128i mask2 = _mm_set1_epi32(0x00FF0000);
const __m128i mask3 = _mm_set1_epi32(0x000000FF);
for (u32 n = 0; n < num_iterations; ++n)
{
const __m128i src_vector = _mm_loadu_si128(src_ptr);
const __m128i v1 = _mm_and_si128(src_vector, mask1);
const __m128i v2 = _mm_and_si128(_mm_slli_epi32(src_vector, 16), mask2);
const __m128i v3 = _mm_and_si128(_mm_srli_epi32(src_vector, 16), mask3);
const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3);
_mm_stream_si128(dst_ptr, shuffled_vector);
++dst_ptr;
++src_ptr;
}
}
}
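Both SSE paths implement the same per-pixel transform: convert to a 24-bit integer depth value (for the D32F source), then reverse the three depth bytes into big-endian order while leaving the fourth byte in place. A scalar reference of that transform follows, assuming it matches the vector code above; the helper names are hypothetical and for illustration only.

#include <cmath>
#include <cstdint>

// Scalar equivalent of one 32-bit lane of convert_le_d24x8_to_be_d24x8:
// bytes [b0,b1,b2,b3] -> [b2,b1,b0,b3], i.e. swap the low and high depth
// bytes and keep byte 3 (the X8/stencil slot) untouched.
static inline uint32_t le_d24x8_to_be_d24x8(uint32_t v)
{
    return (v & 0xFF00FF00u) | ((v << 16) & 0x00FF0000u) | ((v >> 16) & 0x000000FFu);
}

// Scalar equivalent of one lane of convert_le_f32_to_be_d24: expand the
// normalized float to the 24-bit depth range (rounded to nearest), then
// apply the same byte swap.
static inline uint32_t le_f32_to_be_d24(float depth)
{
    const uint32_t d = static_cast<uint32_t>(std::lround(depth * 16777214.f));
    return le_d24x8_to_be_d24x8(d);
}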

View File

@@ -169,6 +169,9 @@ namespace rsx
void clip_image(u8 *dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch);
void clip_image(std::unique_ptr<u8[]>& dst, const u8 *src, int clip_x, int clip_y, int clip_w, int clip_h, int bpp, int src_pitch, int dst_pitch);
void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows);
void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows);
void fill_scale_offset_matrix(void *dest_, bool transpose,
float offset_x, float offset_y, float offset_z,
float scale_x, float scale_y, float scale_z);