diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h
index 315f0cbf1e..3c9e3b263e 100644
--- a/rpcs3/Emu/RSX/Common/texture_cache.h
+++ b/rpcs3/Emu/RSX/Common/texture_cache.h
@@ -475,7 +475,7 @@ namespace rsx
 			rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, component_order swizzle_flags, rsx::flags32_t flags) = 0;
 		virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format, texture_upload_context context,
 			const std::vector<subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0;
-		virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, bool memory_load) = 0;
+		virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, const image_section_attributes_t& attrs, const GCM_tile_reference& tile, bool memory_load) = 0;
 		virtual void set_component_order(section_storage_type& section, u32 gcm_format, component_order expected) = 0;
 		virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex, bool strong_ordering = true) = 0;
 		virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector<copy_region_descriptor>& sources, const texture_channel_remap_t& remap_vector) = 0;
@@ -2551,11 +2551,10 @@ namespace rsx
 				src_address += (src.width - src_w) * src_bpp;
 			}
 
-			const auto is_tiled_mem = [&](const utils::address_range& range)
+			const auto get_tiled_region = [&](const utils::address_range& range)
 			{
 				auto rsxthr = rsx::get_current_renderer();
-				auto region = rsxthr->get_tiled_memory_region(range);
-				return region.tile != nullptr;
+				return rsxthr->get_tiled_memory_region(range);
 			};
 
 			auto rtt_lookup = [&m_rtts, &cmd, &scale_x, &scale_y, this](u32 address, u32 width, u32 height, u32 pitch, u8 bpp, rsx::flags32_t access, bool allow_clipped) -> typename surface_store_type::surface_overlap_info
@@ -2662,8 +2661,10 @@ namespace rsx
 			};
 
 			// Check tiled mem
-			const auto dst_is_tiled = is_tiled_mem(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
-			const auto src_is_tiled = is_tiled_mem(utils::address_range::start_length(src_address, src.pitch * src.height));
+			const auto dst_tile = get_tiled_region(utils::address_range::start_length(dst_address, dst.pitch * dst.clip_height));
+			const auto src_tile = get_tiled_region(utils::address_range::start_length(src_address, src.pitch * src.height));
+			const auto dst_is_tiled = !!dst_tile;
+			const auto src_is_tiled = !!src_tile;
 
 			// Check if src/dst are parts of render targets
 			typename surface_store_type::surface_overlap_info dst_subres;
@@ -3219,9 +3220,10 @@ namespace rsx
 					{
 						.pitch = dst.pitch,
 						.width = static_cast<u16>(dst_dimensions.width),
-						.height = static_cast<u16>(dst_dimensions.height)
+						.height = static_cast<u16>(dst_dimensions.height),
+						.bpp = dst_bpp
 					};
-					cached_dest = create_nul_section(cmd, rsx_range, attrs, force_dma_load);
+					cached_dest = create_nul_section(cmd, rsx_range, attrs, dst_tile, force_dma_load);
 				}
 				else
 				{
diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h
index de5084bb2c..5a8aae2b9d 100644
--- a/rpcs3/Emu/RSX/GL/GLTextureCache.h
+++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h
@@ -712,6 +712,7 @@ namespace gl
 			gl::command_context& /*cmd*/,
 			const utils::address_range& rsx_range,
 			const rsx::image_section_attributes_t& attrs,
+			const rsx::GCM_tile_reference& /*tile*/,
 			bool /*memory_load*/) override
 		{
 			auto& cached = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);
diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h
index 4edecb2407..1470519568 100644
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@@ -512,6 +512,7 @@ namespace vk
 	{
 		u32 tile_base_address;
 		u32 tile_base_offset;
+		u32 tile_rw_offset;
 		u32 tile_size;
 		u32 tile_pitch;
 		u32 bank;
@@ -643,8 +644,9 @@ namespace vk
 			params.factor = factor;
 			params.num_tiles_per_row = tiles_per_row;
 			params.tile_base_address = config.tile_base_address;
+			params.tile_rw_offset = config.tile_rw_offset;
 			params.tile_size = config.tile_size;
-			params.tile_offset = config.tile_base_offset;
+			params.tile_address_offset = config.tile_base_offset;
 			params.tile_pitch = config.tile_pitch;
 			params.tile_bank = config.bank;
 			params.image_width = config.image_width;
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp
index d1119bdd56..0aa7d864b7 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.cpp
+++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp
@@ -174,7 +174,7 @@ namespace vk
 		// NOTE: Do not unmap. This can be extremely slow on some platforms.
 	}
 
-	std::pair<u32, buffer*> dma_block::get(const utils::address_range& range)
+	dma_mapping_handle dma_block::get(const utils::address_range& range)
 	{
 		if (inheritance_info.parent)
 		{
@@ -331,7 +331,7 @@ namespace vk
 		block->init(*g_render_device, base_address, expected_length);
 	}
 
-	std::pair<u32, buffer*> map_dma(u32 local_address, u32 length)
+	dma_mapping_handle map_dma(u32 local_address, u32 length)
 	{
 		// Not much contention expected here, avoid searching twice
 		std::lock_guard lock(g_dma_mutex);
diff --git a/rpcs3/Emu/RSX/VK/VKDMA.h b/rpcs3/Emu/RSX/VK/VKDMA.h
index 35587c74c9..44ad623825 100644
--- a/rpcs3/Emu/RSX/VK/VKDMA.h
+++ b/rpcs3/Emu/RSX/VK/VKDMA.h
@@ -4,7 +4,9 @@
 
 namespace vk
 {
-	std::pair<u32, vk::buffer*> map_dma(u32 local_address, u32 length);
+	using dma_mapping_handle = std::pair<u32, vk::buffer*>;
+
+	dma_mapping_handle map_dma(u32 local_address, u32 length);
 	void load_dma(u32 local_address, u32 length);
 	void flush_dma(u32 local_address, u32 length);
 	void unmap_dma(u32 local_address, u32 length);
diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp
index 4ee73769fc..5262fabf79 100644
--- a/rpcs3/Emu/RSX/VK/VKTexture.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp
@@ -1274,6 +1274,7 @@ namespace vk
 		{
 			.tile_base_address = tiled_region.base_address,
 			.tile_base_offset = range.start - tiled_region.base_address,
+			.tile_rw_offset = range.start - tiled_region.base_address, // TODO
 			.tile_size = tiled_region.tile->size,
 			.tile_pitch = tiled_region.tile->pitch,
 			.bank = tiled_region.tile->bank,
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
index bda591b6ee..a7be3215eb 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@@ -95,7 +95,23 @@ namespace vk
 		const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(valid_range);
 		const bool require_tiling = !!tiled_region;
 		const bool require_gpu_transform = require_format_conversion || pack_unpack_swap_bytes || require_tiling;
-		auto dma_mapping = vk::map_dma(valid_range.start, valid_range.length());
+
+		auto dma_sync_region = valid_range;
+		dma_mapping_handle dma_mapping = { 0, nullptr };
+
+		auto dma_sync = [&dma_sync_region, &dma_mapping](bool load, bool force = false)
+		{
+			if (dma_mapping.second && !force)
+			{
+				return;
+			}
+
+			dma_mapping = vk::map_dma(dma_sync_region.start, dma_sync_region.length());
+			if (load)
+			{
+				vk::load_dma(dma_sync_region.start, dma_sync_region.length());
+			}
+		};
 
 		if (require_gpu_transform)
 		{
@@ -104,20 +120,16 @@ namespace vk
 			const auto task_length = transfer_pitch * src_area.height();
 			auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
 
+#if !DEBUG_DMA_TILING
 			if (require_tiling)
 			{
+				// Safety padding
 				working_buffer_length += tiled_region.tile->size;
 
-				// Calculate actual section length
-				const auto available_tile_size = tiled_region.tile->size - (valid_range.start - tiled_region.base_address);
-				const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64);
-				section_length = std::min(max_content_size, available_tile_size);
-
-				if (section_length > valid_range.length()) [[ likely ]]
-				{
-					dma_mapping = vk::map_dma(valid_range.start, section_length);
-				}
+				// Calculate actual working section for the memory op
+				dma_sync_region = tiled_region.tile_align(dma_sync_region);
 			}
+#endif
 
 			auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length);
 			u32 result_offset = 0;
@@ -177,14 +189,43 @@ namespace vk
 #if !DEBUG_DMA_TILING
 				// Compute -> Compute barrier
 				vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
-					VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
-					VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
+					VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+					VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+				// We don't need to calibrate write if two conditions are met:
+				// 1. The start offset of our 2D region is a multiple of 64 lines
+				// 2. We use the whole pitch.
+				// If these conditions are not met, we need to upload the entire tile (or at least the affected tiles wholly)
+
+				if (valid_range.start != dma_sync_region.start || real_pitch != tiled_region.tile->pitch)
+				{
+					// Tile indices run to the end of the row (full pitch).
+					// Tiles address outside their 64x64 area too, so we need to actually load the whole thing and "fill in" missing blocks.
+					// Visualizing "hot" pixels when doing a partial copy is very revealing, there's lots of data from the padding areas to be filled in.
+
+					dma_sync(true);
+					ensure(dma_mapping.second);
+
+					// Upload memory to the working buffer
+					const auto dst_offset = task_length; // Append to the end of the input
+					VkBufferCopy mem_load{};
+					mem_load.srcOffset = dma_mapping.first;
+					mem_load.dstOffset = dst_offset;
+					mem_load.size = dma_sync_region.length();
+					vkCmdCopyBuffer(cmd, dma_mapping.second->value, working_buffer->value, 1, &mem_load);
+
+					// Transfer -> Compute barrier
+					vk::insert_buffer_memory_barrier(cmd, working_buffer->value, dst_offset, dma_sync_region.length(),
+						VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+						VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
+				}
 
 				// Prepare payload
 				const RSX_detiler_config config =
 				{
 					.tile_base_address = tiled_region.base_address,
 					.tile_base_offset = valid_range.start - tiled_region.base_address,
+					.tile_rw_offset = dma_sync_region.start - tiled_region.base_address,
 					.tile_size = tiled_region.tile->size,
 					.tile_pitch = tiled_region.tile->pitch,
 					.bank = tiled_region.tile->bank,
@@ -195,8 +236,8 @@ namespace vk
 					.src_offset = 0,

 					// TODO: Check interaction with anti-aliasing
-					.image_width = width,
-					.image_height = height,
+					.image_width = (u16)transfer_width,
+					.image_height = (u16)transfer_height,
 					.image_pitch = real_pitch,
 					.image_bpp = context == rsx::texture_upload_context::dma ? internal_bpp : rsx::get_format_block_size_in_bytes(gcm_format)
 				};
@@ -207,8 +248,30 @@ namespace vk

 				// Update internal variables
 				result_offset = task_length;
-				real_pitch = tiled_region.tile->pitch;
+				real_pitch = tiled_region.tile->pitch; // We're always copying the full image. In case of partials we're "filling in" blocks, not doing partial 2D copies.
 				require_rw_barrier = true;
+
+#if 0
+				vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, working_buffer_length,
+					VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+					VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
+
+				// Debug write
+				auto scratch_img = vk::get_typeless_helper(VK_FORMAT_B8G8R8A8_UNORM, RSX_FORMAT_CLASS_COLOR, tiled_region.tile->pitch / 4, 768);
+				scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+				VkBufferImageCopy dbg_copy{};
+				dbg_copy.bufferOffset = config.dst_offset;
+				dbg_copy.imageExtent.width = width;
+				dbg_copy.imageExtent.height = height;
+				dbg_copy.imageExtent.depth = 1;
+				dbg_copy.bufferRowLength = tiled_region.tile->pitch / 4;
+				dbg_copy.imageSubresource = { .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1 };
+				vk::copy_buffer_to_image(cmd, working_buffer, scratch_img, dbg_copy);
+
+				scratch_img->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+#endif
+
 #endif
 			}
@@ -221,6 +284,8 @@ namespace vk

 			if (rsx_pitch == real_pitch) [[likely]]
 			{
+				dma_sync(false);
+
 				VkBufferCopy copy = {};
 				copy.srcOffset = result_offset;
 				copy.dstOffset = dma_mapping.first;
@@ -229,13 +294,7 @@ namespace vk
 			}
 			else
 			{
-				if (context != rsx::texture_upload_context::dma)
-				{
-					// Partial load for the bits outside the existing image
-					// NOTE: A true DMA section would have been prepped beforehand
-					// TODO: Parial range load/flush
-					vk::load_dma(valid_range.start, section_length);
-				}
+				dma_sync(true);

 				std::vector<VkBufferCopy> copy;
 				copy.reserve(transfer_height);
@@ -255,6 +314,8 @@ namespace vk
 		}
 		else
 		{
+			dma_sync(false);
+
 			VkBufferImageCopy region = {};
 			region.bufferRowLength = (rsx_pitch / internal_bpp);
 			region.imageSubresource = { src->aspect(), 0, 0, 1 };
@@ -1011,6 +1072,7 @@ namespace vk
 		vk::command_buffer& /*cmd*/,
 		const utils::address_range& rsx_range,
 		const rsx::image_section_attributes_t& attrs,
+		const rsx::GCM_tile_reference& tile,
 		bool memory_load)
 	{
 		auto& region = *find_cached_texture(rsx_range, { .gcm_format = RSX_GCM_FORMAT_IGNORED }, true, false, false);
@@ -1022,7 +1084,7 @@ namespace vk
 		region.set_dirty(false);
 		region.set_unpack_swap_bytes(true);

-		if (memory_load)
+		if (memory_load && !tile) // Memory load on DMA tiles will always happen during the actual copy command
 		{
 			vk::map_dma(rsx_range.start, rsx_range.length());
 			vk::load_dma(rsx_range.start, rsx_range.length());
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h
index c8d7a2fdc4..3ed8d75c47 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.h
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h
@@ -482,7 +482,8 @@ namespace vk
 		cached_texture_section* create_new_texture(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format,
 			rsx::texture_upload_context context, rsx::texture_dimension_extended type, bool swizzled, rsx::component_order swizzle_flags, rsx::flags32_t flags) override;

-		cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs, bool memory_load) override;
+		cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, const rsx::image_section_attributes_t& attrs,
+			const rsx::GCM_tile_reference& tile, bool memory_load) override;

 		cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u32 pitch, u32 gcm_format,
 			rsx::texture_upload_context context, const std::vector<rsx::subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override;