vk/dma: Allow interoperability between pass-through and write-back DMA caching types

Authored by kd-11 on 2021-01-19 00:40:56 +03:00; committed by kd-11
parent e56da4eb46
commit 67949bb5b7
4 changed files with 123 additions and 144 deletions

View File

@@ -12,16 +12,15 @@ namespace vk
{
static constexpr usz s_dma_block_length = 0x00010000;
static constexpr u32 s_dma_block_mask = 0xFFFF0000;
//static constexpr u32 s_dma_offset_mask = 0x0000FFFF;
static constexpr u32 s_page_size = 65536;
static constexpr u32 s_page_align = s_page_size - 1;
static constexpr u32 s_pages_per_entry = 32;
static constexpr u32 s_bits_per_page = 2;
static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);
std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;
dma_block::~dma_block()
{
// Use safe free (uses gc to clean up)
free();
}
void* dma_block::map_range(const utils::address_range& range)
{
if (inheritance_info.parent)
@@ -49,19 +48,24 @@ namespace vk
void dma_block::allocate(const render_device& dev, usz size)
{
if (allocated_memory)
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
free();
allocated_memory = std::make_unique<vk::buffer>(dev, size,
dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
}
void dma_block::free()
{
if (allocated_memory)
{
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
}
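For context, free() above relies on the resource manager's deferred disposal: dispose() takes ownership of the buffer and only destroys it once any GPU work that may still reference it has finished. A minimal generic sketch of that pattern, under that assumption (the class and method names below are illustrative; only dispose() itself appears in the actual code):

#include <memory>
#include <vector>

// Illustrative sketch of a deferred-disposal queue; names are hypothetical.
template <typename T>
class deferred_disposer
{
	std::vector<std::unique_ptr<T>> m_graveyard;

public:
	void dispose(std::unique_ptr<T>& obj)
	{
		// Take ownership immediately; the caller's pointer becomes empty.
		m_graveyard.emplace_back(std::move(obj));
	}

	void collect()
	{
		// Call once the GPU work referencing these objects is known to be finished.
		m_graveyard.clear();
	}
};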
void dma_block::init(const render_device& dev, u32 addr, usz size)
{
ensure(size);
@@ -69,7 +73,6 @@ namespace vk
base_address = addr;
allocate(dev, size);
page_info.resize(size / s_bytes_per_entry, ~0ull);
}
void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
inheritance_info.block_offset = (addr - parent->base_address);
}
void dma_block::set_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
page_info[word] &= ~(3 << shift);
page_info[word] |= (bits << shift);
}
bool dma_block::test_page_bit(u32 offset, u64 bits)
{
const auto entry = (offset / s_bytes_per_entry);
const auto word = entry / s_pages_per_entry;
const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;
return !!(page_info[word] & (bits << shift));
}
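As a worked example of the packing these helpers manage (this hunk appears to remove them), using the constants declared at the top of the file: pages are 64 KiB, each page gets a 2-bit state, and 32 such states fit in one u64, so each page_info entry covers 2 MiB of the block. A self-contained sketch of that scheme, not a line-for-line copy of the functions above:

#include <cstdint>
#include <vector>

// Sketch of a 2-bit-per-page state table: 64 KiB pages, 32 states per 64-bit word.
void set_page_state(std::vector<std::uint64_t>& words, std::uint32_t byte_offset, std::uint64_t state)
{
	const auto page  = byte_offset / 0x10000u; // 64 KiB page index
	const auto word  = page / 32;              // 32 two-bit fields per u64
	const auto shift = (page % 32) * 2;        // bit position inside the word

	words[word] &= ~(3ull << shift);           // clear the old 2-bit state
	words[word] |= (state << shift);           // write the new state
}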
void dma_block::mark_dirty(const utils::address_range& range)
{
if (!inheritance_info.parent)
{
const u32 start = utils::align(range.start, s_page_size);
const u32 end = ((range.end + 1) & s_page_align);
for (u32 page = start; page < end; page += s_page_size)
{
set_page_bit(page - base_address, page_bits::dirty);
}
if (start > range.start) [[unlikely]]
{
set_page_bit(start - s_page_size, page_bits::nocache);
}
if (end < range.end) [[unlikely]]
{
set_page_bit(end + s_page_size, page_bits::nocache);
}
}
else
{
inheritance_info.parent->mark_dirty(range);
}
}
void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
{
if (!inheritance_info.parent)
{
auto bit_offset = page_offset / s_bytes_per_entry;
ensure(bit_offset + bits.size() <= page_info.size());
std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
}
else
{
inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
}
}
void dma_block::flush(const utils::address_range& range)
{
auto src = map_range(range);
@@ -206,11 +148,10 @@ namespace vk
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
free();
parent->set_page_info(inheritance_info.block_offset, page_info);
page_info.clear();
//parent->set_page_info(inheritance_info.block_offset, page_info);
//page_info.clear();
}
}
@@ -222,8 +163,8 @@ namespace vk
allocate(dev, new_size);
const auto required_entries = new_size / s_bytes_per_entry;
page_info.resize(required_entries, ~0ull);
//const auto required_entries = new_size / s_bytes_per_entry;
//page_info.resize(required_entries, ~0ull);
}
u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk
void dma_block_EXT::allocate(const render_device& dev, usz size)
{
if (allocated_memory)
{
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
auto gc = vk::get_resource_manager();
gc->dispose(allocated_memory);
}
// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
free();
allocated_memory = std::make_unique<vk::buffer>(dev,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
// NOP
}
void create_dma_block(std::unique_ptr<dma_block>& block)
bool test_host_pointer(u32 base_address, usz length)
{
#if 0 // Unusable due to vm locks
auto block = vm::get(vm::any, base_address);
ensure(block);
if ((block->addr + block->size) < (base_address + length))
{
return false;
}
if (block->flags & 0x120)
{
return true;
}
auto range_info = block->peek(base_address, u32(length));
return !!range_info.second;
#endif
#ifdef _WIN32
const bool allow_host_buffers = true;
MEMORY_BASIC_INFORMATION mem_info;
if (!::VirtualQuery(vm::get_super_ptr<const void>(base_address), &mem_info, sizeof(mem_info)))
{
rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
return false;
}
return (mem_info.RegionSize >= length);
#else
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
return true; // *nix behavior is unknown with NVIDIA drivers
#endif
}
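The probe above matters because dma_block_EXT presumably maps guest memory into Vulkan through VK_EXT_external_memory_host, which requires the whole range to be one importable host allocation; VirtualQuery's RegionSize covers everything from the queried address to the end of the region with identical attributes, so RegionSize >= length means the range does not cross a mapping boundary. A minimal sketch of such an import (standard Vulkan API, not RPCS3 code; the device, pointer and memory-type selection are assumed to come from elsewhere):

#include <vulkan/vulkan.h>

// Sketch: import an existing host allocation as VkDeviceMemory via VK_EXT_external_memory_host.
VkDeviceMemory import_host_range(VkDevice device, void* host_ptr, VkDeviceSize size, uint32_t memory_type_index)
{
	VkImportMemoryHostPointerInfoEXT import_info{};
	import_info.sType        = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
	import_info.handleType   = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
	import_info.pHostPointer = host_ptr; // must honor minImportedHostPointerAlignment

	VkMemoryAllocateInfo alloc_info{};
	alloc_info.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
	alloc_info.pNext           = &import_info;
	alloc_info.allocationSize  = size;   // also subject to the alignment requirement
	alloc_info.memoryTypeIndex = memory_type_index;

	VkDeviceMemory memory = VK_NULL_HANDLE;
	return (vkAllocateMemory(device, &alloc_info, nullptr, &memory) == VK_SUCCESS) ? memory : VK_NULL_HANDLE;
}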
void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
{
const auto vendor = g_render_device->gpu().get_driver_vendor();
#ifdef _WIN32
const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
test_host_pointer(base_address, expected_length) :
true;
#else
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
#endif
if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
if (allow_host_buffers && g_render_device->get_external_memory_host_support())
{
block.reset(new dma_block_EXT());
}
@@ -295,6 +269,8 @@ namespace vk
{
block.reset(new dma_block());
}
block->init(*g_render_device, base_address, expected_length);
}
std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
@@ -315,25 +291,28 @@ namespace vk
if (first_block == last_block) [[likely]]
{
auto &block_info = g_dma_pool[first_block];
if (!block_info) create_dma_block(block_info);
block_info->init(*g_render_device, first_block, s_dma_block_length);
ensure(!block_info);
create_dma_block(block_info, first_block, s_dma_block_length);
return block_info->get(map_range);
}
dma_block* block_head = nullptr;
auto block_end = utils::align(limit, s_dma_block_length);
// Reverse scan to try and find the minimum required length in case of other chaining
for (auto block = last_block; block != first_block; block -= s_dma_block_length)
if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
{
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
// Reverse scan to try and find the minimum required length in case of other chaining
for (auto block = last_block; block != first_block; block -= s_dma_block_length)
{
const auto end = found->second->end();
last_block = std::max(last_block, end & s_dma_block_mask);
block_end = std::max(block_end, end + 1);
if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
{
const auto end = found->second->end();
last_block = std::max(last_block, end & s_dma_block_mask);
block_end = std::max(block_end, end + 1);
break;
break;
}
}
}
@@ -342,37 +321,31 @@ namespace vk
auto found = g_dma_pool.find(block);
auto &entry = g_dma_pool[block];
const bool exists = !!entry;
if (!exists) create_dma_block(entry);
if (block == first_block)
{
block_head = entry->head();
if (exists)
if (entry && entry->end() < limit)
{
if (entry->end() < limit)
{
auto new_length = block_end - block_head->start();
block_head->extend(cmd, *g_render_device, new_length);
}
// Then the references to this object do not go to the end of the list as will be done with this new allocation.
// A dumb release is therefore safe...
entry.reset();
}
else
if (!entry)
{
auto required_size = (block_end - block);
block_head->init(*g_render_device, block, required_size);
create_dma_block(entry, block, required_size);
}
block_head = entry->head();
}
else if (entry)
{
entry->set_parent(cmd, block_head);
}
else
{
if (exists)
{
entry->set_parent(cmd, block_head);
}
else
{
entry->init(block_head, block, s_dma_block_length);
}
entry.reset(new dma_block());
entry->init(block_head, block, s_dma_block_length);
}
}
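To make the block arithmetic above concrete, assuming first_block and last_block identify the first and last 64 KiB blocks touched by the request, as the s_dma_block_mask constant suggests, a request straddling a block boundary works out as follows (numbers purely illustrative):

// Illustrative numbers only:
//   local_address = 0x1F8000, length = 0x10000  -> bytes 0x1F8000..0x207FFF
//   first_block   = 0x1F8000 & 0xFFFF0000       = 0x1F0000
//   last_block    = 0x207FFF & 0xFFFF0000       = 0x200000
// The request touches two 64 KiB blocks, so 0x1F0000 becomes (or is extended into)
// the head allocation, and the 0x200000 entry is either re-parented onto that head
// or created fresh as a child block of it.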

View File

@@ -13,13 +13,6 @@ namespace vk
class dma_block
{
protected:
enum page_bits
{
synchronized = 0,
dirty = 1,
nocache = 3
};
struct
{
dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk
u32 base_address = 0;
std::unique_ptr<buffer> allocated_memory;
std::vector<u64> page_info;
virtual void allocate(const render_device& dev, usz size);
virtual void free();
virtual void* map_range(const utils::address_range& range);
virtual void unmap();
void set_page_bit(u32 page, u64 bits);
bool test_page_bit(u32 page, u64 bits);
void mark_dirty(const utils::address_range& range);
void set_page_info(u32 page_offset, const std::vector<u64>& bits);
public:
dma_block() = default;
virtual ~dma_block();
virtual void init(const render_device& dev, u32 addr, usz size);
virtual void init(dma_block* parent, u32 addr, usz size);
virtual void flush(const utils::address_range& range);

View File

@@ -70,7 +70,6 @@ namespace vk
vk::clear_resolve_helpers();
vk::clear_dma_resources();
vk::vmm_reset();
vk::get_resource_manager()->destroy();
vk::clear_scratch_resources();
vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
p.second->destroy();
}
g_overlay_passes.clear();
// This must be the last item destroyed
vk::get_resource_manager()->destroy();
}
const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
}
void do_query_cleanup(vk::command_buffer& cmd)
{
auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());

View File

@@ -905,6 +905,8 @@ namespace vk
}
auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));
ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
vk::load_dma(::narrow<u32>(src_address), data_length);
upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
}
// Copy from upload heap to scratch mem
if (!opt.deferred_cmds.empty())
if (opt.require_upload)
{
for (const auto& copy_cmd : opt.deferred_cmds)
{
@@ -953,7 +955,8 @@ namespace vk
scratch_offset += image_linear_size;
ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
}
else if (opt.require_upload)
if (opt.require_upload)
{
if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
{
@@ -974,7 +977,19 @@ namespace vk
{
ensure(scratch_buf);
vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
if (upload_commands.size() > 1)
{
auto range_ptr = buffer_copies.data();
for (const auto& op : upload_commands)
{
vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
range_ptr += op.second;
}
}
else
{
vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
}
insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
@@ -1020,7 +1035,7 @@ namespace vk
vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
}
else if (opt.require_upload)
else if (upload_commands.size() > 1)
{
auto region_ptr = copy_regions.data();
for (const auto& op : upload_commands)