From c18e5e07cc63ad215a415df02fdedf5836295073 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Wed, 14 Jul 2021 01:19:36 +0300
Subject: [PATCH] vk: Implement VRAM spilling

- The idea is to shift memory to "shared graphics memory" when VRAM is running out
---
 rpcs3/Emu/RSX/Common/TextureUtils.h  |   5 +-
 rpcs3/Emu/RSX/VK/VKPresent.cpp       |   2 +-
 rpcs3/Emu/RSX/VK/VKRenderTargets.cpp | 281 ++++++++++++++++++++++++++-
 rpcs3/Emu/RSX/VK/VKRenderTargets.h   |  35 +++-
 4 files changed, 314 insertions(+), 9 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h
index aa39ccf227..d8614df041 100644
--- a/rpcs3/Emu/RSX/Common/TextureUtils.h
+++ b/rpcs3/Emu/RSX/Common/TextureUtils.h
@@ -49,7 +49,10 @@ namespace rsx
 
 			// Arbitrary r/w flags, use with caution.
 			memory_write = 8,
-			memory_read = 16
+			memory_read = 16,
+
+			// Not r/w but signifies a GPU reference to this object.
+			gpu_reference = 32
 		};
 
 	private:
diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp
index fbc5e55acf..2c0ba8fbb7 100644
--- a/rpcs3/Emu/RSX/VK/VKPresent.cpp
+++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp
@@ -112,7 +112,7 @@ void VKGSRender::advance_queued_frames()
 	vk::vmm_check_memory_usage();
 
 	// m_rtts storage is double buffered and should be safe to tag on frame boundary
-	m_rtts.free_invalidated(*m_current_command_buffer);
+	m_rtts.free_invalidated(*m_current_command_buffer, vk::vmm_determine_memory_load_severity());
 
 	// Texture cache is also double buffered to prevent use-after-free
 	m_texture_cache.on_frame_end();
diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
index 1921a69688..95ee510dc2 100644
--- a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
+++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp
@@ -52,7 +52,59 @@ namespace vk
 		if (severity >= rsx::problem_severity::fatal)
 		{
-			// TODO
+			// Drop MSAA resolve/unresolve caches. Only trigger when a hard sync is guaranteed to follow, else it will cause even more problems!
+			auto relieve_memory_pressure = [&](const auto& list)
+			{
+				// Two passes, to ensure resources are available where they are most needed
+				std::vector<std::unique_ptr<vk::viewable_image>> resolve_target_cache;
+				std::vector<vk::render_target*> deferred_spills;
+				auto gc = vk::get_resource_manager();
+
+				// 1. Scan the list and spill resources that can be spilled immediately, if requested. Also gather reusable resolve memory from surfaces that will not be spilled.
+				for (auto& surface : list)
+				{
+					auto& rtt = surface.second;
+					if (!rtt->spill_request_tag || rtt->spill_request_tag < surface.second->last_rw_access_tag)
+					{
+						// We're not going to be spilling into system RAM. If a MSAA resolve target exists, remove it to save memory.
+						if (rtt->resolve_surface)
+						{
+							resolve_target_cache.emplace_back(std::move(rtt->resolve_surface));
+							rtt->msaa_flags |= rsx::surface_state_flags::require_resolve;
+							any_released |= true;
+						}
+
+						rtt->spill_request_tag = 0;
+						continue;
+					}
+
+					if (rtt->resolve_surface || rtt->samples() == 1)
+					{
+						// Can spill immediately. Do it.
+						rtt->spill(cmd, resolve_target_cache);
+						any_released |= true;
+						continue;
+					}
+
+					deferred_spills.push_back(rtt.get());
+				}
+
+				// 2. Spill whatever was deferred; the resolve cache should now hold enough reusable memory for the second pass.
+				for (auto& surface : deferred_spills)
+				{
+					surface->spill(cmd, resolve_target_cache);
+					any_released |= true;
+				}
+
+				// 3. Discard the now-useless resolve cache memory.
+				for (auto& data : resolve_target_cache)
+				{
+					gc->dispose(data);
+				}
+			};
+
+			relieve_memory_pressure(m_render_targets_storage);
+			relieve_memory_pressure(m_depth_stencil_storage);
 		}
 
 		return any_released;
 	}
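
The fatal-severity branch above works in passes: surfaces that are not going to be spilled first give up their MSAA resolve images, and only then are the remaining MSAA surfaces spilled, so they can reuse the freed images as resolve scratch instead of allocating new ones while memory is already exhausted. A minimal standalone sketch of that scheme in plain standard C++ (none of the names below appear in the patch):

    #include <memory>
    #include <vector>

    struct Scratch {}; // stands in for an MSAA resolve image

    struct Surface
    {
        bool wants_spill = false;         // analogous to an honoured spill_request_tag
        bool needs_scratch = false;       // MSAA surface with no resolve target of its own
        std::unique_ptr<Scratch> scratch; // analogous to resolve_surface

        void spill(std::vector<std::unique_ptr<Scratch>>& cache)
        {
            if (needs_scratch && !cache.empty())
            {
                scratch = std::move(cache.back()); // reuse memory harvested in pass 1
                cache.pop_back();
            }
            // ... copy the contents out and release the VRAM ...
        }
    };

    void relieve(std::vector<Surface>& surfaces)
    {
        std::vector<std::unique_ptr<Scratch>> cache;
        std::vector<Surface*> deferred;

        // Pass 1: harvest scratch memory from keepers, spill the easy cases right away.
        for (auto& s : surfaces)
        {
            if (!s.wants_spill)
            {
                if (s.scratch) cache.emplace_back(std::move(s.scratch));
            }
            else if (!s.needs_scratch)
            {
                s.spill(cache);
            }
            else
            {
                deferred.push_back(&s); // wait until scratch memory is available
            }
        }

        // Pass 2: the deferred MSAA surfaces can now reuse the harvested scratch images.
        for (auto* s : deferred)
            s->spill(cache);

        cache.clear(); // pass 3: drop whatever scratch memory is left over
    }

The real code keys the keep/spill decision on spill_request_tag versus last_rw_access_tag rather than plain booleans, but the ordering of the passes is the same.
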
@@ -123,6 +175,63 @@ namespace vk
 		return (surface_cache_vram_load > surface_cache_allocation_quota);
 	}
 
+	bool surface_cache::spill_unused_memory()
+	{
+		// Determine how much memory, if any, we need to move out to system RAM
+		const u64 current_surface_cache_memory = vk::vmm_get_application_pool_usage(VMM_ALLOCATION_POOL_SURFACE_CACHE);
+		const u64 total_device_memory = vk::get_current_renderer()->get_memory_mapping().device_local_total_bytes;
+		const u64 target_memory = get_surface_cache_memory_quota(total_device_memory);
+
+		rsx_log.warning("Surface cache memory usage is %lluM", current_surface_cache_memory / 0x100000);
+		if (current_surface_cache_memory < target_memory)
+		{
+			rsx_log.warning("Surface cache memory usage is very low. Will not spill contents to RAM");
+			return false;
+		}
+
+		// Very slow, but should only be called when the situation is dire
+		std::vector<render_target*> sorted_list;
+		sorted_list.reserve(m_render_targets_storage.size() + m_depth_stencil_storage.size());
+
+		auto process_list_function = [&](const auto& list)
+		{
+			for (auto& surface : list)
+			{
+				if (surface.second->value && !surface.second->is_bound)
+				{
+					sorted_list.push_back(surface.second.get());
+				}
+			}
+		};
+
+		process_list_function(m_render_targets_storage);
+		process_list_function(m_depth_stencil_storage);
+
+		std::sort(sorted_list.begin(), sorted_list.end(), [](const auto& a, const auto& b)
+		{
+			return a->last_rw_access_tag < b->last_rw_access_tag;
+		});
+
+		// Mark surfaces for spilling until at least bytes_to_remove bytes of VRAM are covered
+		u64 bytes_spilled = 0;
+		const u64 bytes_to_remove = current_surface_cache_memory - target_memory;
+		const u64 spill_time = rsx::get_shared_tag();
+
+		for (auto& surface : sorted_list)
+		{
+			bytes_spilled += surface->memory->size();
+			surface->spill_request_tag = spill_time;
+
+			if (bytes_spilled >= bytes_to_remove)
+			{
+				break;
+			}
+		}
+
+		rsx_log.warning("Surface cache will attempt to spill %llu bytes.", bytes_spilled);
+		return (bytes_spilled > 0);
+	}
+
 	// Get the linear resolve target bound to this surface. Initialize if none exists
 	vk::viewable_image* render_target::get_resolve_target_safe(vk::command_buffer& cmd)
 	{
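
spill_unused_memory() only marks candidates; the eviction itself is carried out later by handle_memory_pressure(). Victim selection is a least-recently-used walk: surfaces that are resident and not currently bound are sorted by their last access tag and tagged until the projected savings cover the deficit. A self-contained sketch of that selection, with illustrative names only:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Candidate
    {
        std::uint64_t last_use = 0; // analogous to last_rw_access_tag
        std::uint64_t size = 0;     // VRAM footprint in bytes
        bool marked = false;        // analogous to setting spill_request_tag
    };

    std::uint64_t mark_for_spill(std::vector<Candidate>& list, std::uint64_t deficit)
    {
        // Oldest surfaces first, so hot render targets are the last to be touched.
        std::sort(list.begin(), list.end(),
            [](const Candidate& a, const Candidate& b) { return a.last_use < b.last_use; });

        std::uint64_t covered = 0;
        for (auto& c : list)
        {
            c.marked = true;
            covered += c.size;
            if (covered >= deficit)
                break;
        }
        return covered;
    }
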
@@ -334,6 +443,156 @@ namespace vk
 	}
 
+	std::vector<VkBufferImageCopy> render_target::build_spill_transfer_descriptors(vk::image* target)
+	{
+		std::vector<VkBufferImageCopy> result;
+		result.reserve(2);
+
+		result.push_back({});
+		auto& rgn = result.back();
+		rgn.imageExtent.width = target->width();
+		rgn.imageExtent.height = target->height();
+		rgn.imageExtent.depth = 1;
+		rgn.imageSubresource.aspectMask = target->aspect();
+		rgn.imageSubresource.layerCount = 1;
+
+		if (aspect() == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))
+		{
+			result.push_back(rgn);
+			rgn.imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
+			result.back().imageSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
+			result.back().bufferOffset = target->width() * target->height() * 4;
+		}
+
+		return result;
+	}
+
+	void render_target::spill(vk::command_buffer& cmd, std::vector<std::unique_ptr<vk::viewable_image>>& resolve_cache)
+	{
+		ensure(value);
+
+		u64 element_size;
+		switch (const auto fmt = format())
+		{
+		case VK_FORMAT_D32_SFLOAT:
+			element_size = 4;
+			break;
+		case VK_FORMAT_D32_SFLOAT_S8_UINT:
+		case VK_FORMAT_D24_UNORM_S8_UINT:
+			element_size = 5;
+			break;
+		default:
+			element_size = get_format_texel_width(fmt);
+		}
+
+		vk::image* src = nullptr;
+		if (samples() == 1) [[likely]]
+		{
+			src = this;
+		}
+		else if (resolve_surface)
+		{
+			src = resolve_surface.get();
+		}
+		else
+		{
+			const auto transfer_w = width() * samples_x;
+			const auto transfer_h = height() * samples_y;
+
+			for (auto& surface : resolve_cache)
+			{
+				if (surface->format() == format() &&
+					surface->width() == transfer_w &&
+					surface->height() == transfer_h)
+				{
+					src = surface.get();
+					break;
+				}
+			}
+
+			if (!src)
+			{
+				if (vmm_determine_memory_load_severity() <= rsx::problem_severity::moderate)
+				{
+					// We have some freedom to allocate something. Add to the shared cache
+					src = get_resolve_target_safe(cmd);
+				}
+				else
+				{
+					// TODO: Spill to DMA buf
+					// For now, just skip this one if we don't have the capacity for it
+					rsx_log.warning("Could not spill memory due to resolve failure. Will ignore spilling for the moment.");
+					return;
+				}
+			}
+
+			msaa_flags |= rsx::surface_state_flags::require_resolve;
+		}
+
+		// If a resolve is requested, move the data over to the target first
+		if (msaa_flags & rsx::surface_state_flags::require_resolve)
+		{
+			ensure(samples() > 1);
+			resolve(cmd);
+		}
+
+		const auto pdev = vk::get_current_renderer();
+		const auto alloc_size = element_size * src->width() * src->height();
+
+		m_spilled_mem = std::make_unique<vk::buffer>(*pdev, alloc_size, pdev->get_memory_mapping().host_visible_coherent,
+			0, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, 0, VMM_ALLOCATION_POOL_UNDEFINED);
+
+		const auto regions = build_spill_transfer_descriptors(src);
+		src->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+		vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, m_spilled_mem->value, ::size32(regions), regions.data());
+
+		// Destroy this object's GPU resources through a cloned object; this one is left as an empty shell
+		auto obj = std::unique_ptr<vk::viewable_image>(clone());
+		vk::get_resource_manager()->dispose(obj);
+
+		if (resolve_surface)
+		{
+			// Just add to the resolve cache and move on
+			resolve_cache.emplace_back(std::move(resolve_surface));
+		}
+
+		ensure(!memory && !value && views.empty() && !resolve_surface);
+		spill_request_tag = 0ull;
+	}
+
+	void render_target::unspill(vk::command_buffer& cmd)
+	{
+		// Recreate the image
+		const auto pdev = vk::get_current_renderer();
+		create_impl(*pdev, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, pdev->get_memory_mapping().device_local, VMM_ALLOCATION_POOL_SURFACE_CACHE);
+		change_layout(cmd, is_depth_surface() ? VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
+
+		// Load the image from the host-visible buffer
+		ensure(m_spilled_mem);
+
+		// The data transfer can be skipped if an erase command is being served
+		if (!(state_flags & rsx::surface_state_flags::erase_bkgnd))
+		{
+			// Warn. Ideally this should never happen if you have enough resources
+			rsx_log.warning("[PERFORMANCE WARNING] Loading spilled memory back to the GPU. You may want to lower your resolution scaling.");
+
+			vk::image* dst = (samples() > 1) ? get_resolve_target_safe(cmd) : this;
+			const auto regions = build_spill_transfer_descriptors(dst);
+
+			dst->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+			vkCmdCopyBufferToImage(cmd, m_spilled_mem->value, dst->value, dst->current_layout, ::size32(regions), regions.data());
+
+			if (samples() > 1)
+			{
+				msaa_flags &= ~rsx::surface_state_flags::require_resolve;
+				msaa_flags |= rsx::surface_state_flags::require_unresolve;
+			}
+		}
+
+		// Delete the host-visible buffer
+		vk::get_resource_manager()->dispose(m_spilled_mem);
+	}
+
 	// Load memory from cell and use to initialize the surface
 	void render_target::load_memory(vk::command_buffer& cmd)
 	{
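
The host-side buffer created by spill() is sized as element_size * width * height, and for combined depth/stencil formats the two VkBufferImageCopy regions place the stencil plane directly after the depth plane. That is why D24S8/D32S8 use 5 bytes per texel and why the second region's bufferOffset is width * height * 4. A small arithmetic check of that layout; the 1280x720 resolution is only an example and is not taken from the patch:

    #include <cstdint>

    constexpr std::uint64_t width  = 1280;
    constexpr std::uint64_t height = 720;

    constexpr std::uint64_t depth_bytes    = width * height * 4; // packed D24X8 / D32 plane
    constexpr std::uint64_t stencil_bytes  = width * height * 1; // packed S8 plane
    constexpr std::uint64_t stencil_offset = depth_bytes;        // second region's bufferOffset
    constexpr std::uint64_t total          = depth_bytes + stencil_bytes;

    // Matches element_size (5) * width * height used when allocating m_spilled_mem.
    static_assert(total == 5ull * width * height);
    static_assert(stencil_offset == width * height * 4);
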
@@ -426,6 +685,8 @@ namespace vk
 		}
 	}
 
 	vk::viewable_image* render_target::get_surface(rsx::surface_access access_type)
 	{
+		last_rw_access_tag = rsx::get_shared_tag();
+
 		if (samples() == 1 || access_type == rsx::surface_access::shader_write)
 		{
 			return this;
 		}
@@ -491,6 +752,18 @@ namespace vk
 
 	void render_target::memory_barrier(vk::command_buffer& cmd, rsx::surface_access access)
 	{
+		if (access == rsx::surface_access::gpu_reference)
+		{
+			// This barrier only requires that an object is made available for GPU usage.
+			if (!value)
+			{
+				unspill(cmd);
+			}
+
+			spill_request_tag = 0;
+			return;
+		}
+
 		const bool is_depth = is_depth_surface();
 		const bool should_read_buffers = is_depth ? !!g_cfg.video.read_depth_buffer : !!g_cfg.video.read_color_buffers;
@@ -506,6 +779,12 @@ namespace vk
 			}
 		}
 
+		// Unspill here, because the erase flag may have been set above.
+		if (!value)
+		{
+			unspill(cmd);
+		}
+
 		if (access == rsx::surface_access::shader_write && write_barrier_sync_tag != 0)
 		{
 			if (current_layout == VK_IMAGE_LAYOUT_GENERAL)
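
A spilled surface is restored lazily: the new rsx::surface_access::gpu_reference barrier is the funnel through which GPU-side use passes, and it recreates the image via unspill() on demand while also cancelling any pending spill request. A rough sketch of that pattern with placeholder types, not the renderer's actual classes:

    // Lazy rehydration: any GPU reference restores evicted data before it is needed.
    struct SpillableResource
    {
        bool resident = true;        // stands in for the image handle ('value') being non-null
        bool spill_requested = false;

        void on_gpu_reference()      // analogous to memory_barrier(cmd, gpu_reference)
        {
            if (!resident)
                restore();           // analogous to unspill(cmd)
            spill_requested = false; // a surface referenced this frame must not be evicted
        }

    private:
        void restore()
        {
            resident = true;         // recreate the image and copy data back from the host buffer
        }
    };
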
diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h
index 99238d9590..312bdfbd73 100644
--- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h
+++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h
@@ -24,6 +24,9 @@ namespace vk
 		u64 cyclic_reference_sync_tag = 0;
 		u64 write_barrier_sync_tag = 0;
 
+		// Memory spilling support
+		std::unique_ptr<vk::buffer> m_spilled_mem;
+
 		// MSAA support:
 		// Get the linear resolve target bound to this surface. Initialize if none exists
 		vk::viewable_image* get_resolve_target_safe(vk::command_buffer& cmd);
@@ -40,8 +43,17 @@ namespace vk
 		// Generic - chooses whether to clear or load.
 		void initialize_memory(vk::command_buffer& cmd, rsx::surface_access access);
 
+		// Spill helpers
+		// Re-initialize using spilled memory
+		void unspill(vk::command_buffer& cmd);
+		// Build spill transfer descriptors
+		std::vector<VkBufferImageCopy> build_spill_transfer_descriptors(vk::image* target);
+
 	public:
-		u64 frame_tag = 0; // frame id when invalidated, 0 if not invalid
+		u64 frame_tag = 0;          // frame id when invalidated, 0 if not invalid
+		u64 last_rw_access_tag = 0; // timestamp when this object was last used
+		u64 spill_request_tag = 0;  // timestamp when spilling was requested
+		bool is_bound = false;      // set when the surface is bound for rendering
 
 		using viewable_image::viewable_image;
@@ -54,6 +66,9 @@ namespace vk
 		image_view* get_view(u32 remap_encoding, const std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap,
 			VkImageAspectFlags mask = VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT) override;
 
+		// Memory management
+		void spill(vk::command_buffer& cmd, std::vector<std::unique_ptr<vk::viewable_image>>& resolve_cache);
+
 		// Synchronization
 		void texture_barrier(vk::command_buffer& cmd);
 		void memory_barrier(vk::command_buffer& cmd, rsx::surface_access access);
@@ -270,13 +285,16 @@ namespace vk
 		static bool is_compatible_surface(const vk::render_target* surface, const vk::render_target* ref, u16 width, u16 height, u8 sample_count)
 		{
 			return (surface->format() == ref->format() &&
-					surface->get_spp() == sample_count &&
-					surface->get_surface_width() >= width &&
-					surface->get_surface_height() >= height);
+				surface->get_spp() == sample_count &&
+				surface->get_surface_width() >= width &&
+				surface->get_surface_height() >= height);
 		}
 
 		static void prepare_surface_for_drawing(vk::command_buffer& cmd, vk::render_target* surface)
 		{
+			// Special case barrier
+			surface->memory_barrier(cmd, rsx::surface_access::gpu_reference);
+
 			if (surface->aspect() == VK_IMAGE_ASPECT_COLOR_BIT)
 			{
 				surface->change_layout(cmd, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL);
@@ -288,10 +306,13 @@ namespace vk
 
 			surface->reset_surface_counters();
 			surface->memory_usage_flags |= rsx::surface_usage_flags::attachment;
+			surface->is_bound = true;
 		}
 
-		static void prepare_surface_for_sampling(vk::command_buffer& /*cmd*/, vk::render_target* /*surface*/)
-		{}
+		static void prepare_surface_for_sampling(vk::command_buffer& /*cmd*/, vk::render_target* surface)
+		{
+			surface->is_bound = false;
+		}
 
 		static bool surface_is_pitch_compatible(const std::unique_ptr<vk::render_target>& surface, usz pitch)
 		{
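
The three new fields in the header are the eviction metadata: last_rw_access_tag is bumped on every get_surface(), spill_request_tag records when eviction was requested, and is_bound tracks whether the surface is currently a framebuffer attachment (set in prepare_surface_for_drawing, cleared in prepare_surface_for_sampling). A sketch of the resulting eligibility test, mirroring the checks in spill_unused_memory and handle_memory_pressure with an illustrative type:

    #include <cstdint>

    struct EvictionState
    {
        std::uint64_t last_rw_access_tag = 0; // bumped on every get_surface()
        std::uint64_t spill_request_tag  = 0; // set by spill_unused_memory(), 0 means no request
        bool is_bound = false;                // true while bound as an attachment

        bool should_spill() const
        {
            // A request made before the most recent access is stale and gets dropped.
            return !is_bound && spill_request_tag != 0 && spill_request_tag >= last_rw_access_tag;
        }
    };
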
@@ -385,9 +406,11 @@ namespace vk
 	public:
 		void destroy();
 
+		bool spill_unused_memory();
 		bool is_overallocated();
 		bool can_collapse_surface(const std::unique_ptr<vk::render_target>& surface) override;
 		bool handle_memory_pressure(vk::command_buffer& cmd, rsx::problem_severity severity) override;
 		void free_invalidated(vk::command_buffer& cmd, rsx::problem_severity memory_pressure);
 	};
 }
\ No newline at end of file
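
Taken together, the intended per-frame flow is: measure the memory load, run the frame-boundary housekeeping with that severity, and under pressure mark cold surfaces and then evict them, letting them rehydrate later through the gpu_reference barrier. The diff itself only wires free_invalidated() into VKPresent.cpp; the sketch below assumes spill_unused_memory() and handle_memory_pressure() are driven from the same frame-boundary path, and every name in it is a placeholder rather than the engine's API:

    enum class severity { low, moderate, severe, fatal };

    struct surface_cache_stub
    {
        bool spill_unused_memory()            { return true; } // mark LRU victims
        bool handle_memory_pressure(severity) { return true; } // copy marked surfaces out, free VRAM
        void free_invalidated(severity)       {}               // frame-boundary cleanup
    };

    void on_frame_end(surface_cache_stub& rtts, severity load)
    {
        rtts.free_invalidated(load);           // mirrors the VKPresent.cpp change
        if (load >= severity::severe)          // assumed trigger point; not shown in the diff
        {
            rtts.spill_unused_memory();        // choose cold, unbound surfaces
            rtts.handle_memory_pressure(load); // spill them to host-visible memory
        }
    }
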