vulkan: Use a parallel cb for texture cache ops; bug fixes

vk: More surface fixes and debug stuff

vk: Crude thread sync implementation to prevent cb desync crashes due to resource usage

fix build

more fixes

vulkan: Do not flush command queue if address cannot be flushed

vk: More fixes for accuracy. Needs optimizations

vk: Batch all flush-to-buffer operations in the non-critical path
- More work is needed to make queue submission asynchronous
kd-11 2017-04-21 22:55:05 +03:00
parent fd754a4ddc
commit 2b19031206
5 changed files with 169 additions and 76 deletions
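
The "crude thread sync" mentioned in the messages above boils down to a flag-and-counter handshake between the RSX thread and whichever thread takes the access violation. Below is a minimal sketch of that handshake using the member names introduced in the diff (m_flush_commands, m_queued_threads, m_secondary_cb_guard); the wrapper function names are illustrative only — the real logic lives inline in on_access_violation() and do_local_task() — and the flush body is stubbed out:

#include <atomic>
#include <mutex>

std::atomic<bool> m_flush_commands { false };
std::atomic<int>  m_queued_threads { 0 };
std::mutex        m_secondary_cb_guard;

//Faulting (non-RSX) thread, as in on_access_violation()
bool request_flush_from_other_thread()
{
	m_flush_commands = true;             //ask the RSX thread to drain its primary command buffer
	m_queued_threads++;
	while (m_flush_commands);            //spin until do_local_task() has submitted and reopened it

	std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
	bool status = true;                  //stand-in for flushing through m_secondary_command_buffer

	m_queued_threads--;                  //let the RSX thread resume
	return status;
}

//RSX thread, as in do_local_task()
void service_flush_requests()
{
	if (!m_flush_commands) return;

	//submit the primary command buffer, hard-wait on its fence, reset and reopen it...
	m_flush_commands = false;            //release the waiter(s)
	while (m_queued_threads);            //block until every waiter has finished its flush
}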

View File

@ -493,6 +493,11 @@ VKGSRender::VKGSRender() : GSRender(frame_type::Vulkan)
//create command buffer...
m_command_buffer_pool.create((*m_device));
m_command_buffer.create(m_command_buffer_pool);
//Create secondary command buffer for parallel operations
m_secondary_command_buffer_pool.create((*m_device));
m_secondary_command_buffer.create(m_secondary_command_buffer_pool);
open_command_buffer();
for (u32 i = 0; i < m_swap_chain->get_swap_image_count(); ++i)
@ -620,6 +625,9 @@ VKGSRender::~VKGSRender()
m_command_buffer.destroy();
m_command_buffer_pool.destroy();
m_secondary_command_buffer.destroy();
m_secondary_command_buffer_pool.destroy();
//Device handles/contexts
m_swap_chain->destroy();
m_thread_context.close();
@ -632,7 +640,29 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
if (is_writing)
return m_texture_cache.invalidate_address(address);
else
return m_texture_cache.flush_address(address, *m_device, m_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
{
if (!m_texture_cache.address_is_flushable(address))
return false;
if (std::this_thread::get_id() != rsx_thread)
{
//TODO: Guard this when the renderer is flushing the command queue, might deadlock otherwise
m_flush_commands = true;
m_queued_threads++;
//This is awful!
while (m_flush_commands);
std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
bool status = m_texture_cache.flush_address(address, *m_device, m_secondary_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
m_queued_threads--;
return status;
}
std::lock_guard<std::mutex> lock(m_secondary_cb_guard);
return m_texture_cache.flush_address(address, *m_device, m_secondary_command_buffer, m_memory_type_mapping, m_swap_chain->get_present_queue());
}
return false;
}
@ -646,7 +676,9 @@ void VKGSRender::begin()
{
std::chrono::time_point<steady_clock> submit_start = steady_clock::now();
close_and_submit_command_buffer({}, m_submit_fence);
//?? Should we wait for the queue to actually render to the GPU, or just flush the queue?
//Needs investigation to determine what drivers expect here; bottom_of_pipe is guaranteed to work but will be too slow
close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
vkResetDescriptorPool(*m_device, descriptor_pool, 0);
@ -833,9 +865,9 @@ void VKGSRender::end()
std::chrono::time_point<steady_clock> draw_end = steady_clock::now();
m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(draw_end - vertex_end).count();
rsx::thread::end();
copy_render_targets_to_dma_location();
rsx::thread::end();
}
void VKGSRender::set_viewport()
@ -875,6 +907,8 @@ void VKGSRender::on_init_thread()
GSRender::on_init_thread();
m_attrib_ring_info.init(8 * RING_BUFFER_SIZE);
m_attrib_ring_info.heap.reset(new vk::buffer(*m_device, 8 * RING_BUFFER_SIZE, m_memory_type_mapping.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT|VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0));
rsx_thread = std::this_thread::get_id();
}
void VKGSRender::on_exit()
@ -987,13 +1021,6 @@ void VKGSRender::clear_surface(u32 mask)
void VKGSRender::sync_at_semaphore_release()
{
close_and_submit_command_buffer({}, m_submit_fence);
CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
open_command_buffer();
m_flush_draw_buffers = true;
}
@ -1002,6 +1029,13 @@ void VKGSRender::copy_render_targets_to_dma_location()
if (!m_flush_draw_buffers)
return;
if (!g_cfg_rsx_write_color_buffers && !g_cfg_rsx_write_depth_buffer)
return;
//TODO: Make this asynchronous. Should be similar to a glFlush(), but in this case it's more like a glFinish()
//This is due to all the hard waits for fences
//TODO: Use a command buffer array to allow explicit draw command tracking
if (g_cfg_rsx_write_color_buffers)
{
for (u8 index = 0; index < rsx::limits::color_buffers_count; index++)
@ -1023,7 +1057,28 @@ void VKGSRender::copy_render_targets_to_dma_location()
}
}
m_flush_draw_buffers = false;
close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
open_command_buffer();
}
void VKGSRender::do_local_task()
{
if (m_flush_commands)
{
close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));
open_command_buffer();
m_flush_commands = false;
while (m_queued_threads);
}
}
bool VKGSRender::do_method(u32 cmd, u32 arg)
@ -1294,17 +1349,16 @@ void VKGSRender::write_buffers()
{
}
void VKGSRender::close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence)
void VKGSRender::close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags)
{
CHECK_RESULT(vkEndCommandBuffer(m_command_buffer));
VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
VkCommandBuffer cmd = m_command_buffer;
VkSubmitInfo infos = {};
infos.commandBufferCount = 1;
infos.pCommandBuffers = &cmd;
infos.pWaitDstStageMask = &pipe_stage_flags;
infos.pWaitDstStageMask = &pipeline_stage_flags;
infos.pWaitSemaphores = semaphores.data();
infos.waitSemaphoreCount = static_cast<uint32_t>(semaphores.size());
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
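
The submit-wait-reset-reopen sequence that begin(), copy_render_targets_to_dma_location() and do_local_task() each spell out above is the commit's "hard flush" of the primary command buffer. A sketch of that sequence pulled into one place; flush_command_queue_hard() is a hypothetical name, not a helper this commit adds:

void VKGSRender::flush_command_queue_hard()
{
	close_and_submit_command_buffer({}, m_submit_fence, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);

	//CPU-side hard wait until the GPU has consumed everything submitted so far
	CHECK_RESULT(vkWaitForFences((*m_device), 1, &m_submit_fence, VK_TRUE, ~0ULL));
	CHECK_RESULT(vkResetFences(*m_device, 1, &m_submit_fence));
	CHECK_RESULT(vkResetCommandPool(*m_device, m_command_buffer_pool, 0));

	open_command_buffer();
}

On the TOP_OF_PIPE/BOTTOM_OF_PIPE question raised in the comment above: pipeline_stage_flags only feeds pWaitDstStageMask, which the spec applies to the wait semaphores, so with the empty semaphore list used at these call sites the choice should not change what the submission waits on; the actual synchronization point is the fence wait that follows.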

View File

@ -11,6 +11,7 @@
#include "VKProgramBuffer.h"
#include "../GCM.h"
#include "../rsx_utils.h"
#include <atomic>
#pragma comment(lib, "VKstatic.1.lib")
@ -60,6 +61,9 @@ private:
vk::command_pool m_command_buffer_pool;
vk::command_buffer m_command_buffer;
std::mutex m_secondary_cb_guard;
vk::command_pool m_secondary_command_buffer_pool;
vk::command_buffer m_secondary_command_buffer;
std::array<VkRenderPass, 120> m_render_passes;
VkDescriptorSetLayout descriptor_layouts;
@ -86,7 +90,13 @@ private:
rsx::gcm_framebuffer_info m_surface_info[rsx::limits::color_buffers_count];
rsx::gcm_framebuffer_info m_depth_surface_info;
bool m_flush_draw_buffers = false;
std::atomic<bool> m_flush_commands = false;
std::atomic<int> m_queued_threads = 0;
std::thread::id rsx_thread;
public:
VKGSRender();
@ -94,7 +104,7 @@ public:
private:
void clear_surface(u32 mask);
void close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence);
void close_and_submit_command_buffer(const std::vector<VkSemaphore> &semaphores, VkFence fence, VkPipelineStageFlags pipeline_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
void open_command_buffer();
void sync_at_semaphore_release();
void prepare_rtts();
@ -117,5 +127,7 @@ protected:
bool do_method(u32 id, u32 arg) override;
void flip(int buffer) override;
void do_local_task() override;
bool on_access_violation(u32 address, bool is_writing) override;
};

View File

@ -32,7 +32,7 @@ namespace rsx
namespace vk
{
#define CHECK_RESULT(expr) do { VkResult _res = (expr); if (_res != VK_SUCCESS) fmt::throw_exception("Assertion failed! Result is %Xh", (s32)_res); } while (0)
#define CHECK_RESULT(expr) { VkResult _res = (expr); if (_res != VK_SUCCESS) fmt::throw_exception("Assertion failed! Result is %Xh" HERE, (s32)_res); }
VKAPI_ATTR void *VKAPI_CALL mem_realloc(void *pUserData, void *pOriginal, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
VKAPI_ATTR void *VKAPI_CALL mem_alloc(void *pUserData, size_t size, size_t alignment, VkSystemAllocationScope allocationScope);
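
A side note on the CHECK_RESULT change above: the dropped do { ... } while (0) wrapper is what lets a statement-like macro sit safely inside an unbraced if/else. With the bare braced block, the caller's trailing semicolon becomes an empty statement and detaches a following else. Illustration only; the identifiers below are placeholders:

if (use_fence)
	CHECK_RESULT(vkResetFences(dev, 1, &fence));   //expands to "{ ... };"
else                                               //error: 'else' without a preceding 'if'
	handle_no_fence_path();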

View File

@ -28,6 +28,14 @@ namespace vk
cached_texture_section() {}
void reset(u32 base, u32 length)
{
if (length > cpu_address_range)
release_dma_resources();
rsx::buffered_section::reset(base, length);
}
void create(const u16 w, const u16 h, const u16 depth, const u16 mipmaps, vk::image_view *view, vk::image *image, const u32 native_pitch = 0, bool managed=true)
{
width = w;
@ -38,8 +46,7 @@ namespace vk
uploaded_image_view.reset(view);
vram_texture = image;
if (managed)
managed_texture.reset(image);
if (managed) managed_texture.reset(image);
//TODO: Properly compute these values
this->native_pitch = native_pitch;
@ -105,16 +112,18 @@ namespace vk
bool is_flushable() const
{
if (protection == utils::protection::ro || protection == utils::protection::no)
return true;
if (uploaded_image_view.get() == nullptr && vram_texture != nullptr)
return true;
return false;
//This section is active and can be flushed to cpu
return (protection == utils::protection::no);
}
void copy_texture(vk::command_buffer& cmd, u32 heap_index, VkQueue submit_queue, VkImageLayout layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL)
bool is_flushed() const
{
//This memory section was flushable, but a flush has already removed protection
return (protection == utils::protection::rw && uploaded_image_view.get() == nullptr && managed_texture.get() == nullptr);
}
void copy_texture(vk::command_buffer& cmd, u32 heap_index, VkQueue submit_queue,
bool manage_cb_lifetime = false, VkImageLayout layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL)
{
if (m_device == nullptr)
{
@ -130,7 +139,21 @@ namespace vk
if (dma_buffer.get() == nullptr)
{
dma_buffer.reset(new vk::buffer(*m_device, native_pitch * height, heap_index, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0));
dma_buffer.reset(new vk::buffer(*m_device, align(cpu_address_range, 256), heap_index, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0));
}
if (manage_cb_lifetime)
{
//cb has to be guaranteed to be in a closed state
//This function can be called asynchronously
VkCommandBufferInheritanceInfo inheritance_info = {};
inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
VkCommandBufferBeginInfo begin_infos = {};
begin_infos.pInheritanceInfo = &inheritance_info;
begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos));
}
VkBufferImageCopy copyRegion = {};
@ -147,52 +170,47 @@ namespace vk
vkCmdCopyImageToBuffer(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dma_buffer->value, 1, &copyRegion);
change_image_layout(cmd, vram_texture->value, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, layout, subresource_range);
CHECK_RESULT(vkEndCommandBuffer(cmd));
if (manage_cb_lifetime)
{
CHECK_RESULT(vkEndCommandBuffer(cmd));
VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
VkCommandBuffer command_buffer = cmd;
VkPipelineStageFlags pipe_stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
VkCommandBuffer command_buffer = cmd;
VkSubmitInfo infos = {};
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
infos.commandBufferCount = 1;
infos.pCommandBuffers = &command_buffer;
infos.pWaitDstStageMask = &pipe_stage_flags;
infos.pWaitSemaphores = nullptr;
infos.waitSemaphoreCount = 0;
VkSubmitInfo infos = {};
infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
infos.commandBufferCount = 1;
infos.pCommandBuffers = &command_buffer;
infos.pWaitDstStageMask = &pipe_stage_flags;
infos.pWaitSemaphores = nullptr;
infos.waitSemaphoreCount = 0;
CHECK_RESULT(vkQueueSubmit(submit_queue, 1, &infos, dma_fence));
CHECK_RESULT(vkQueueSubmit(submit_queue, 1, &infos, dma_fence));
//Now we need to restart the command-buffer to restore it to the way it was before...
CHECK_RESULT(vkWaitForFences(*m_device, 1, &dma_fence, VK_TRUE, UINT64_MAX));
CHECK_RESULT(vkResetCommandPool(*m_device, cmd.get_command_pool(), 0));
CHECK_RESULT(vkResetFences(*m_device, 1, &dma_fence));
VkCommandBufferInheritanceInfo inheritance_info = {};
inheritance_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO;
VkCommandBufferBeginInfo begin_infos = {};
begin_infos.pInheritanceInfo = &inheritance_info;
begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos));
//Now we need to restart the command-buffer to restore it to the way it was before...
CHECK_RESULT(vkWaitForFences(*m_device, 1, &dma_fence, VK_TRUE, UINT64_MAX));
CHECK_RESULT(vkResetCommandPool(*m_device, cmd.get_command_pool(), 0));
CHECK_RESULT(vkResetFences(*m_device, 1, &dma_fence));
}
}
template<typename T>
void do_memory_transfer(void *pixels_dst, void *pixels_src)
{
//LOG_ERROR(RSX, "COPY %d -> %d", native_pitch, pitch);
if (pitch == native_pitch)
{
if (sizeof(T) == 1)
memcpy(pixels_dst, pixels_src, native_pitch * height);
memcpy(pixels_dst, pixels_src, cpu_address_range);
else
{
const u32 block_size = native_pitch * height / sizeof(T);
const u32 block_size = width * height;
auto typed_dst = (be_t<T> *)pixels_dst;
auto typed_src = (T *)pixels_src;
for (u8 n = 0; n < block_size; ++n)
typed_dst[n] = typed_src[n];
for (u32 px = 0; px < block_size; ++px)
typed_dst[px] = typed_src[px];
}
}
else
@ -203,7 +221,7 @@ namespace vk
u8 *typed_src = (u8 *)pixels_src;
//TODO: Scaling
for (int row = 0; row < height; ++row)
for (u16 row = 0; row < height; ++row)
{
memcpy(typed_dst, typed_src, native_pitch);
typed_dst += pitch;
@ -218,9 +236,9 @@ namespace vk
auto typed_dst = (be_t<T> *)pixels_dst;
auto typed_src = (T *)pixels_src;
for (int row = 0; row < height; ++row)
for (u16 row = 0; row < height; ++row)
{
for (int px = 0; px < width; ++px)
for (u16 px = 0; px < width; ++px)
{
typed_dst[px] = typed_src[px];
}
@ -240,15 +258,13 @@ namespace vk
if (dma_fence == VK_NULL_HANDLE || dma_buffer.get() == nullptr)
{
LOG_WARNING(RSX, "Cache miss at address 0x%X. This is gonna hurt...", cpu_address_base);
copy_texture(cmd, heap_index, submit_queue, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
verify (HERE), (dma_fence != VK_NULL_HANDLE && dma_buffer.get());
copy_texture(cmd, heap_index, submit_queue, true, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
}
protect(utils::protection::rw);
//TODO: Image scaling, etc
void* pixels_src = dma_buffer->map(0, VK_WHOLE_SIZE);
void* pixels_src = dma_buffer->map(0, cpu_address_range);
void* pixels_dst = vm::base(cpu_address_base);
//We have to do our own byte swapping since the driver doesn't do it for us
@ -273,12 +289,7 @@ namespace vk
}
dma_buffer->unmap();
//Cleanup
//These sections are usually one-use only so we destroy system resources
//TODO: Recycle dma buffers
release_dma_resources();
vram_texture = nullptr; //Let m_rtts handle lifetime management
//It's highly likely that this surface will be reused, so we just leave resources in place
}
};
@ -333,7 +344,7 @@ namespace vk
for (auto &tex : m_cache)
{
if (tex.is_dirty()) continue;
if (!tex.is_flushable()) continue;
if (!tex.is_flushable() && !tex.is_flushed()) continue;
if (tex.matches(address, range))
return &tex;
@ -529,15 +540,16 @@ namespace vk
void lock_memory_region(vk::render_target* image, const u32 memory_address, const u32 memory_size, const u32 width, const u32 height)
{
cached_texture_section& region = find_cached_texture(memory_address, memory_size, true, width, height, 1);
region.create(width, height, 1, 1, nullptr, image, image->native_pitch, false);
if (!region.is_locked())
{
region.reset(memory_address, memory_size);
region.protect(utils::protection::no);
region.set_dirty(false);
texture_cache_range = region.get_min_max(texture_cache_range);
}
region.protect(utils::protection::no);
region.create(width, height, 1, 1, nullptr, image, image->native_pitch, false);
}
void flush_memory_to_cache(const u32 memory_address, const u32 memory_size, vk::command_buffer&cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue)
@ -554,6 +566,20 @@ namespace vk
region->copy_texture(cmd, memory_types.host_visible_coherent, submit_queue);
}
bool address_is_flushable(u32 address)
{
for (auto &tex : m_cache)
{
if (tex.is_dirty()) continue;
if (!tex.is_flushable()) continue;
if (tex.overlaps(address))
return true;
}
return false;
}
bool flush_address(u32 address, vk::render_device& dev, vk::command_buffer& cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue)
{
if (address < texture_cache_range.first ||
@ -584,8 +610,6 @@ namespace vk
//TODO: Map basic host_visible memory without coherent constraint
tex.flush(dev, cmd, memory_types.host_visible_coherent, submit_queue);
tex.set_dirty(true);
response = true;
}
}
@ -607,6 +631,7 @@ namespace vk
auto &tex = m_cache[i];
if (tex.is_dirty()) continue;
if (!tex.is_locked()) continue; //flushable sections can be 'clean' but unlocked. TODO: Handle this better
auto overlapped = tex.overlaps_page(trampled_range, address);
if (std::get<0>(overlapped))
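
For reference, the manage_cb_lifetime path that copy_texture() gains above follows the standard one-shot command buffer pattern: begin with ONE_TIME_SUBMIT, record, end, submit against a fence, hard-wait, then recycle the pool so the buffer can be recorded again. A condensed sketch using the same helpers the file already relies on (CHECK_RESULT, vk::command_buffer); record_and_submit_once() is an illustrative name, not a function in this commit:

void record_and_submit_once(vk::command_buffer& cmd, VkQueue queue, VkFence fence, VkDevice dev)
{
	VkCommandBufferBeginInfo begin_infos = {};
	begin_infos.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
	begin_infos.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
	CHECK_RESULT(vkBeginCommandBuffer(cmd, &begin_infos));

	//...record vkCmdCopyImageToBuffer and the surrounding layout transitions here...

	CHECK_RESULT(vkEndCommandBuffer(cmd));

	VkCommandBuffer handle = cmd;
	VkSubmitInfo infos = {};
	infos.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
	infos.commandBufferCount = 1;
	infos.pCommandBuffers = &handle;
	CHECK_RESULT(vkQueueSubmit(queue, 1, &infos, fence));

	//Hard wait, then reset the pool and fence so the next flush can reuse the same command buffer
	CHECK_RESULT(vkWaitForFences(dev, 1, &fence, VK_TRUE, UINT64_MAX));
	CHECK_RESULT(vkResetCommandPool(dev, cmd.get_command_pool(), 0));
	CHECK_RESULT(vkResetFences(dev, 1, &fence));
}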

View File

@ -136,12 +136,13 @@ namespace rsx
locked_address_range = align(base + length, 4096) - locked_address_base;
protection = utils::protection::rw;
locked = false;
}
void protect(utils::protection prot)
{
if (prot == protection) return;
utils::memory_protect(vm::base(locked_address_base), locked_address_range, prot);
protection = prot;
locked = prot != utils::protection::rw;
@ -149,7 +150,8 @@ namespace rsx
void unprotect()
{
return protect(utils::protection::rw);
protect(utils::protection::rw);
locked = false;
}
bool overlaps(std::pair<u32, u32> range)