rsx: Detiler improvements

- Detile on blit src read
- Improve blit engine integration
This commit is contained in:
kd-11 2023-10-02 23:49:44 +03:00 committed by kd-11
parent 9dca70ec9e
commit e95cff0bde
5 changed files with 117 additions and 68 deletions

View File

@ -2548,12 +2548,12 @@ namespace rsx
src_address += (src.width - src_w) * src_bpp;
}
const auto is_tiled = [&]()
const auto is_tiled_mem = [&](const utils::address_range& range)
{
auto rsxthr = rsx::get_current_renderer();
auto region = rsxthr->get_tiled_memory_region(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
auto region = rsxthr->get_tiled_memory_region(range);
return region.tile != nullptr;
}();
};
auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, rsx::flags32_t access, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
{
@ -2637,6 +2637,10 @@ namespace rsx
return true;
};
// Check tiled mem
const auto dst_is_tiled = is_tiled_mem(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
const auto src_is_tiled = is_tiled_mem(utils::address_range::start_length(src_address, src.pitch * src.height));
// Check if src/dst are parts of render targets
typename surface_store_type::surface_overlap_info dst_subres;
bool use_null_region = false;
@ -2646,7 +2650,6 @@ namespace rsx
auto src_subres = rtt_lookup(src_address, src_w, src_h, src.pitch, src_bpp, surface_access::transfer_read, false);
src_is_render_target = src_subres.surface != nullptr;
if (get_location(dst_address) == CELL_GCM_LOCATION_LOCAL)
{
// TODO: HACK
@ -2657,7 +2660,7 @@ namespace rsx
else
{
// Surface exists in local memory.
use_null_region = (is_copy_op && !is_format_convert);
use_null_region = (is_copy_op && !is_format_convert && !src_is_tiled);
// Invalidate surfaces in range. Sample tests should catch overlaps in theory.
m_rtts.invalidate_range(utils::address_range::start_length(dst_address, dst.pitch* dst_h));
@ -2693,7 +2696,7 @@ namespace rsx
else
{
// Determine whether to perform this transfer on CPU or GPU (src data may not be graphical)
const bool is_trivial_copy = is_copy_op && !is_format_convert && !dst.swizzled && !is_tiled;
const bool is_trivial_copy = is_copy_op && !is_format_convert && !dst.swizzled && !dst_is_tiled && !src_is_tiled;
const bool is_block_transfer = (dst_w == src_w && dst_h == src_h && (src.pitch == dst.pitch || src_h == 1));
const bool is_mirror_op = (dst.scale_x < 0.f || dst.scale_y < 0.f);
@ -2723,17 +2726,11 @@ namespace rsx
skip_if_collision_exists = true;
}
if (!g_cfg.video.use_gpu_texture_scaling)
if (!g_cfg.video.use_gpu_texture_scaling && !dst_is_tiled && !src_is_tiled)
{
if (dst.swizzled)
{
// Swizzle operation requested. Use fallback
if (is_tiled)
{
// Corner case
// FIXME: We have had hw-accelerated swizzle support for some time now
rsx_log.error("Swizzled write to tiled area.");
}
return false;
}

View File

@ -18,6 +18,11 @@
#define OCCLUSION_MAX_POOL_SIZE DESCRIPTOR_MAX_DRAW_CALLS
namespace rsx
{
// Forward declaration only; the type is taken by const reference in the
// vk::detile_memory_block prototype, so the full definition is not needed here.
struct GCM_tile_reference;
}
namespace vk
{
// Forward declarations
@ -86,6 +91,10 @@ namespace vk
const std::vector<rsx::subresource_layout>& subresource_layout, int format, bool is_swizzled, u16 layer_count,
VkImageAspectFlags flags, vk::data_heap &upload_heap, u32 heap_align, rsx::flags32_t image_setup_flags);
// Detiles the tiled memory block covered by 'range' using GPU work recorded into 'cmd'.
// 'tiled_region' supplies the tile parameters (base address, size, pitch, bank).
// 'width', 'height' and 'bpp' describe the image; the linear pitch is derived as width * bpp.
// Returns the buffer holding the linear output and the offset of that output within the buffer.
std::pair<buffer*, u32> detile_memory_block(
const vk::command_buffer& cmd, const rsx::GCM_tile_reference& tiled_region, const utils::address_range& range,
u16 width, u16 height, u8 bpp);
// Other texture management helpers
void copy_image_to_buffer(const vk::command_buffer& cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, const image_readback_options_t& options = {});
void copy_buffer_to_image(const vk::command_buffer& cmd, const vk::buffer* src, const vk::image* dst, const VkBufferImageCopy& region);

View File

@ -709,62 +709,11 @@ namespace vk
);
subres.data = std::span(ext_data);
#else
const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address);
const auto max_content_size = tiled_region.tile->pitch * utils::align<u32>(subres.height_in_block, 64);
const auto section_length = std::min(max_content_size, available_tile_size);
const auto dma_mapping = vk::map_dma(range.start, section_length);
vk::load_dma(range.start, section_length);
const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data
const auto tiled_data_scratch_offset = section_length * 2;
const auto linear_data_scratch_offset = 0;
// Schedule the job
const RSX_detiler_config config =
{
.tile_base_address = tiled_region.base_address,
.tile_base_offset = range.start - tiled_region.base_address,
.tile_size = tiled_region.tile->size,
.tile_pitch = tiled_region.tile->pitch,
.bank = tiled_region.tile->bank,
.dst = scratch_buf,
.dst_offset = linear_data_scratch_offset,
.src = scratch_buf,
.src_offset = section_length * 2,
.image_width = subres.width_in_block,
.image_height = subres.height_in_block,
.image_pitch = subres.width_in_block * static_cast<u32>(get_bpp()),
.image_bpp = get_bpp()
};
// Transfer
VkBufferCopy copy_rgn
{
.srcOffset = dma_mapping.first,
.dstOffset = tiled_data_scratch_offset,
.size = section_length
};
vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn);
// Barrier
vk::insert_buffer_memory_barrier(
cmd, scratch_buf->value, linear_data_scratch_offset, section_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
// Detile
vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::decode>>()->run(cmd, config);
// Barrier
vk::insert_buffer_memory_barrier(
cmd, scratch_buf->value, linear_data_scratch_offset, subres.width_in_block * get_bpp() * subres.height_in_block,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT);
const auto [scratch_buf, linear_data_scratch_offset] = vk::detile_memory_block(cmd, tiled_region, range, subres.width_in_block, subres.height_in_block, get_bpp());
// FIXME: !!EVIL!!
subres.data = { scratch_buf, linear_data_scratch_offset };
subres.pitch_in_block = subres.width_in_block;
upload_flags |= source_is_gpu_resident;
heap_align = subres.width_in_block * get_bpp();
#endif

View File

@ -1252,6 +1252,71 @@ namespace vk
}
}
// Detiles a block of tiled memory into a linear layout using a GPU compute job.
//
// cmd          - Command buffer that the copy, barriers and compute dispatch are recorded into.
// tiled_region - Tile reference providing the base address and the tile's size, pitch and bank.
// range        - Address range to decode; range.start selects the offset inside the tile.
// width        - Image width (the linear pitch is computed as width * bpp).
// height       - Image height in rows.
// bpp          - Bytes per element used for the pitch computation.
//
// Returns the scratch buffer holding the linear data and the offset of that data within it.
// NOTE(review): the returned buffer comes from vk::get_scratch_buffer; presumably its contents
// are only valid until the scratch buffer is requested again - confirm against callers.
// NOTE(review): assumes range.start >= tiled_region.base_address, otherwise the unsigned
// subtraction below wraps - confirm that get_tiled_memory_region guarantees this.
std::pair<buffer*, u32> detile_memory_block(const vk::command_buffer& cmd, const rsx::GCM_tile_reference& tiled_region,
	const utils::address_range& range, u16 width, u16 height, u8 bpp)
{
	// Calculate the true length of the usable memory section
	const auto tile_base_offset = range.start - tiled_region.base_address;
	const auto available_tile_size = tiled_region.tile->size - tile_base_offset;
	const auto max_content_size = tiled_region.tile->pitch * utils::align<u32>(height, 64);
	const auto section_length = std::min(max_content_size, available_tile_size);

	// Sync the DMA layer
	const auto dma_mapping = vk::map_dma(range.start, section_length);
	vk::load_dma(range.start, section_length);

	// Allocate scratch and prepare for the GPU job
	const auto scratch_buf = vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data
	const auto tiled_data_scratch_offset = section_length * 2;
	const auto linear_data_scratch_offset = 0u;

	// Size of the detiled image that the compute job writes at the start of the scratch buffer
	const auto linear_data_length = static_cast<u32>(width) * height * bpp;

	// Schedule the job
	const RSX_detiler_config config =
	{
		.tile_base_address = tiled_region.base_address,
		.tile_base_offset = tile_base_offset,
		.tile_size = tiled_region.tile->size,
		.tile_pitch = tiled_region.tile->pitch,
		.bank = tiled_region.tile->bank,

		.dst = scratch_buf,
		.dst_offset = linear_data_scratch_offset,
		.src = scratch_buf,
		.src_offset = tiled_data_scratch_offset,

		.image_width = width,
		.image_height = height,
		.image_pitch = static_cast<u32>(width) * bpp,
		.image_bpp = bpp
	};

	// Transfer the raw tiled bytes from the DMA mapping into the last third of the scratch buffer
	VkBufferCopy copy_rgn
	{
		.srcOffset = dma_mapping.first,
		.dstOffset = tiled_data_scratch_offset,
		.size = section_length
	};
	vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn);

	// Barrier: make the transfer write visible to the compute shader.
	// The range must cover the region the copy actually wrote (the tiled data section),
	// not the linear output section.
	vk::insert_buffer_memory_barrier(
		cmd, scratch_buf->value, tiled_data_scratch_offset, section_length,
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
		VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);

	// Detile
	vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::decode>>()->run(cmd, config);

	// Barrier: make the linear output visible to subsequent transfer and compute consumers
	vk::insert_buffer_memory_barrier(
		cmd, scratch_buf->value, linear_data_scratch_offset, linear_data_length,
		VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
		VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT);

	// Return a descriptor pointing to the detiled (linear) data
	return { scratch_buf, linear_data_scratch_offset };
}
void blitter::scale_image(vk::command_buffer& cmd, vk::image* src, vk::image* dst, areai src_area, areai dst_area, bool interpolate, const rsx::typeless_xfer& xfer_info)
{
vk::image* real_src = src;

View File

@ -1050,9 +1050,38 @@ namespace vk
upload_command_flags |= upload_contents_async;
}
std::vector<rsx::subresource_layout> tmp;
auto p_subresource_layout = &subresource_layout;
u32 heap_align = upload_heap_align_default;
if (auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(rsx_range);
context == rsx::texture_upload_context::blit_engine_src && tiled_region)
{
if (mipmaps > 1)
{
// This really shouldn't happen on framebuffer tiled memory
rsx_log.error("Tiled decode of mipmapped textures is not supported.");
}
else
{
const auto bpp = rsx::get_format_block_size_in_bytes(gcm_format);
const auto [scratch_buf, linear_data_scratch_offset] = vk::detile_memory_block(cmd, tiled_region, rsx_range, width, height, bpp);
auto subres = subresource_layout.front();
// FIXME: !!EVIL!!
subres.data = { scratch_buf, linear_data_scratch_offset };
subres.pitch_in_block = width;
upload_command_flags |= source_is_gpu_resident;
heap_align = width * bpp;
tmp.push_back(subres);
p_subresource_layout = &tmp;
}
}
const u16 layer_count = (type == rsx::texture_dimension_extended::texture_dimension_cubemap) ? 6 : 1;
vk::upload_image(cmd, image, subresource_layout, gcm_format, input_swizzled, layer_count, image->aspect(),
*m_texture_upload_heap, upload_heap_align_default, upload_command_flags);
vk::upload_image(cmd, image, *p_subresource_layout, gcm_format, input_swizzled, layer_count, image->aspect(),
*m_texture_upload_heap, heap_align, upload_command_flags);
vk::leave_uninterruptible();