Mirror of https://github.com/RPCS3/rpcs3.git, synced 2024-11-17 08:11:51 +00:00
vk/dma: Allow interoperability between pass-through and write-back DMA caching types
This commit is contained in:
parent e56da4eb46
commit 67949bb5b7
@@ -12,16 +12,15 @@ namespace vk
{
	static constexpr usz s_dma_block_length = 0x00010000;
	static constexpr u32 s_dma_block_mask = 0xFFFF0000;
	//static constexpr u32 s_dma_offset_mask = 0x0000FFFF;

	static constexpr u32 s_page_size = 65536;
	static constexpr u32 s_page_align = s_page_size - 1;
	static constexpr u32 s_pages_per_entry = 32;
	static constexpr u32 s_bits_per_page = 2;
	static constexpr u32 s_bytes_per_entry = (s_page_size * s_pages_per_entry);

	std::unordered_map<u32, std::unique_ptr<dma_block>> g_dma_pool;

	dma_block::~dma_block()
	{
		// Use safe free (uses gc to clean up)
		free();
	}

	void* dma_block::map_range(const utils::address_range& range)
	{
		if (inheritance_info.parent)
@@ -49,19 +48,24 @@ namespace vk

	void dma_block::allocate(const render_device& dev, usz size)
	{
		if (allocated_memory)
		{
			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
			auto gc = vk::get_resource_manager();
			gc->dispose(allocated_memory);
		}
		// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
		// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
		free();

		allocated_memory = std::make_unique<vk::buffer>(dev, size,
			dev.get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0);
	}

	void dma_block::free()
	{
		if (allocated_memory)
		{
			auto gc = vk::get_resource_manager();
			gc->dispose(allocated_memory);
		}
	}
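The allocate()/free() paths above never delete the old buffer directly; they hand it to the resource manager because in-flight command buffers may still reference it. A minimal sketch of that deferred-disposal idea (the resource_manager type and eval_deferred_deletions() hook here are illustrative stand-ins, not the exact RPCS3 API):

#include <memory>
#include <utility>
#include <vector>

struct buffer { /* wraps a VkBuffer and its backing memory */ };

// Illustrative deferred-disposal queue in the spirit of vk::get_resource_manager()->dispose():
// ownership is taken immediately, destruction happens only once the GPU can no longer touch it.
class resource_manager
{
	std::vector<std::unique_ptr<buffer>> m_disposed;

public:
	void dispose(std::unique_ptr<buffer>& obj)
	{
		m_disposed.emplace_back(std::move(obj)); // caller's pointer becomes null, object stays alive
	}

	void eval_deferred_deletions() // hypothetical: called after the relevant fence has signalled
	{
		m_disposed.clear();
	}
};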
	void dma_block::init(const render_device& dev, u32 addr, usz size)
	{
		ensure(size);
@@ -69,7 +73,6 @@ namespace vk
		base_address = addr;

		allocate(dev, size);
		page_info.resize(size / s_bytes_per_entry, ~0ull);
	}

	void dma_block::init(dma_block* parent, u32 addr, usz size)
@@ -79,67 +82,6 @@ namespace vk
		inheritance_info.block_offset = (addr - parent->base_address);
	}

	void dma_block::set_page_bit(u32 offset, u64 bits)
	{
		const auto entry = (offset / s_bytes_per_entry);
		const auto word = entry / s_pages_per_entry;
		const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;

		page_info[word] &= ~(3 << shift);
		page_info[word] |= (bits << shift);
	}

	bool dma_block::test_page_bit(u32 offset, u64 bits)
	{
		const auto entry = (offset / s_bytes_per_entry);
		const auto word = entry / s_pages_per_entry;
		const auto shift = (entry % s_pages_per_entry) * s_bits_per_page;

		return !!(page_info[word] & (bits << shift));
	}
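For reference, the page_info words being dropped here encode a small per-page state (see the page_bits enum in the header hunk below: synchronized = 0, dirty = 1, nocache = 3), packed s_pages_per_entry states to a 64-bit word at s_bits_per_page bits each. A self-contained model of that packing, written against a plain page index rather than the byte offsets the removed code uses:

#include <cstdint>
#include <vector>

constexpr uint32_t pages_per_word = 32; // s_pages_per_entry
constexpr uint32_t bits_per_page  = 2;  // s_bits_per_page

// Write the 2-bit state for one page into the packed word array.
void set_state(std::vector<uint64_t>& words, uint32_t page_index, uint64_t state)
{
	const auto word  = page_index / pages_per_word;
	const auto shift = (page_index % pages_per_word) * bits_per_page;
	words[word] &= ~(3ull << shift); // clear the old 2-bit slot
	words[word] |= (state << shift); // write the new state
}

// Read the 2-bit state back.
uint64_t get_state(const std::vector<uint64_t>& words, uint32_t page_index)
{
	const auto word  = page_index / pages_per_word;
	const auto shift = (page_index % pages_per_word) * bits_per_page;
	return (words[word] >> shift) & 3ull;
}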
	void dma_block::mark_dirty(const utils::address_range& range)
	{
		if (!inheritance_info.parent)
		{
			const u32 start = utils::align(range.start, s_page_size);
			const u32 end = ((range.end + 1) & s_page_align);

			for (u32 page = start; page < end; page += s_page_size)
			{
				set_page_bit(page - base_address, page_bits::dirty);
			}

			if (start > range.start) [[unlikely]]
			{
				set_page_bit(start - s_page_size, page_bits::nocache);
			}

			if (end < range.end) [[unlikely]]
			{
				set_page_bit(end + s_page_size, page_bits::nocache);
			}
		}
		else
		{
			inheritance_info.parent->mark_dirty(range);
		}
	}

	void dma_block::set_page_info(u32 page_offset, const std::vector<u64>& bits)
	{
		if (!inheritance_info.parent)
		{
			auto bit_offset = page_offset / s_bytes_per_entry;
			ensure(bit_offset + bits.size() <= page_info.size());
			std::memcpy(page_info.data() + bit_offset, bits.data(), bits.size());
		}
		else
		{
			inheritance_info.parent->set_page_info(page_offset + inheritance_info.block_offset, bits);
		}
	}

	void dma_block::flush(const utils::address_range& range)
	{
		auto src = map_range(range);
@@ -206,11 +148,10 @@ namespace vk
		{
			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
			auto gc = vk::get_resource_manager();
			gc->dispose(allocated_memory);
			free();

			parent->set_page_info(inheritance_info.block_offset, page_info);
			page_info.clear();
			//parent->set_page_info(inheritance_info.block_offset, page_info);
			//page_info.clear();
		}
	}

@@ -222,8 +163,8 @@ namespace vk

		allocate(dev, new_size);

		const auto required_entries = new_size / s_bytes_per_entry;
		page_info.resize(required_entries, ~0ull);
		//const auto required_entries = new_size / s_bytes_per_entry;
		//page_info.resize(required_entries, ~0ull);
	}

	u32 dma_block::start() const
@@ -244,13 +185,9 @@ namespace vk

	void dma_block_EXT::allocate(const render_device& dev, usz size)
	{
		if (allocated_memory)
		{
			// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
			// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
			auto gc = vk::get_resource_manager();
			gc->dispose(allocated_memory);
		}
		// Acquired blocks are always to be assumed dirty. It is not possible to synchronize host access and inline
		// buffer copies without causing weird issues. Overlapped incomplete data ends up overwriting host-uploaded data.
		free();

		allocated_memory = std::make_unique<vk::buffer>(dev,
			VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
@@ -278,16 +215,53 @@ namespace vk
		// NOP
	}

	void create_dma_block(std::unique_ptr<dma_block>& block)
	bool test_host_pointer(u32 base_address, usz length)
	{
#if 0 // Unusable due to vm locks
		auto block = vm::get(vm::any, base_address);
		ensure(block);

		if ((block->addr + block->size) < (base_address + length))
		{
			return false;
		}

		if (block->flags & 0x120)
		{
			return true;
		}

		auto range_info = block->peek(base_address, u32(length));
		return !!range_info.second;
#endif

#ifdef _WIN32
		const bool allow_host_buffers = true;
		MEMORY_BASIC_INFORMATION mem_info;
		if (!::VirtualQuery(vm::get_super_ptr<const void>(base_address), &mem_info, sizeof(mem_info)))
		{
			rsx_log.error("VirtualQuery failed! LastError=0x%x", GetLastError());
			return false;
		}

		return (mem_info.RegionSize >= length);
#else
		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
		return true; // *nix behavior is unknown with NVIDIA drivers
#endif
	}

	void create_dma_block(std::unique_ptr<dma_block>& block, u32 base_address, u32 expected_length)
	{
		const auto vendor = g_render_device->gpu().get_driver_vendor();

#ifdef _WIN32
		const bool allow_host_buffers = (vendor == driver_vendor::NVIDIA) ?
			test_host_pointer(base_address, expected_length) :
			true;
#else
		// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
		const bool allow_host_buffers = (vendor != driver_vendor::AMD && vendor != driver_vendor::RADV);
#endif
		if (g_render_device->get_external_memory_host_support() && allow_host_buffers)
		if (allow_host_buffers && g_render_device->get_external_memory_host_support())
		{
			block.reset(new dma_block_EXT());
		}
@@ -295,6 +269,8 @@ namespace vk
		{
			block.reset(new dma_block());
		}

		block->init(*g_render_device, base_address, expected_length);
	}
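From a caller's point of view the choice made in create_dma_block() is invisible: map_dma() returns an offset plus a vk::buffer that either wraps imported guest memory (pass-through, dma_block_EXT) or a host-coherent staging buffer that has to be refreshed explicitly (write-back, dma_block). A rough usage sketch, assuming load_dma() is that refresh step and is effectively a no-op for pass-through blocks:

// Hypothetical caller; the real call sites are in the texture upload path further below.
void upload_from_guest_memory(const vk::command_buffer& cmd, u32 local_address, u32 length)
{
	// Offset into the returned buffer + the buffer covering [local_address, local_address + length)
	auto [offset, dma_buf] = vk::map_dma(cmd, local_address, length);
	ensure(dma_buf->size() >= offset + length);

	// Pull current guest data into the block (assumed to be a no-op for dma_block_EXT).
	vk::load_dma(local_address, length);

	// dma_buf/offset can now be used as the source of vkCmdCopyBuffer / vkCmdCopyBufferToImage.
}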
	std::pair<u32, vk::buffer*> map_dma(const command_buffer& cmd, u32 local_address, u32 length)
@@ -315,25 +291,28 @@ namespace vk
		if (first_block == last_block) [[likely]]
		{
			auto &block_info = g_dma_pool[first_block];
			if (!block_info) create_dma_block(block_info);

			block_info->init(*g_render_device, first_block, s_dma_block_length);
			ensure(!block_info);
			create_dma_block(block_info, first_block, s_dma_block_length);
			return block_info->get(map_range);
		}

		dma_block* block_head = nullptr;
		auto block_end = utils::align(limit, s_dma_block_length);

		// Reverse scan to try and find the minimum required length in case of other chaining
		for (auto block = last_block; block != first_block; block -= s_dma_block_length)
		if (g_render_device->gpu().get_driver_vendor() != driver_vendor::NVIDIA ||
			rsx::get_location(local_address) == CELL_GCM_LOCATION_LOCAL)
		{
			if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
			// Reverse scan to try and find the minimum required length in case of other chaining
			for (auto block = last_block; block != first_block; block -= s_dma_block_length)
			{
				const auto end = found->second->end();
				last_block = std::max(last_block, end & s_dma_block_mask);
				block_end = std::max(block_end, end + 1);
				if (auto found = g_dma_pool.find(block); found != g_dma_pool.end())
				{
					const auto end = found->second->end();
					last_block = std::max(last_block, end & s_dma_block_mask);
					block_end = std::max(block_end, end + 1);

					break;
					break;
				}
			}
		}
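The fast path and the reverse scan above both work in units of 64KiB DMA blocks (s_dma_block_length / s_dma_block_mask). A small sketch of the assumed block arithmetic; the actual first_block/last_block/limit values are computed earlier in map_dma and are not shown in this hunk:

#include <cstdint>
using u32 = std::uint32_t;

constexpr u32 block_length = 0x00010000; // s_dma_block_length
constexpr u32 block_mask   = 0xFFFF0000; // s_dma_block_mask

struct block_span { u32 first_block; u32 last_block; u32 limit; };

// Assumed mapping of a guest range onto the 64KiB-aligned block keys used by g_dma_pool.
block_span locate_blocks(u32 local_address, u32 length)
{
	const u32 limit       = local_address + length;   // one past the last byte
	const u32 first_block = local_address & block_mask;
	const u32 last_block  = (limit - 1) & block_mask;
	return { first_block, last_block, limit };
}

// Example: locate_blocks(0x00010800, 0x20000) yields first_block = 0x00010000 and
// last_block = 0x00030000, so the request spans three blocks and takes the chaining path.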
@@ -342,37 +321,31 @@ namespace vk
		auto found = g_dma_pool.find(block);
		auto &entry = g_dma_pool[block];

		const bool exists = !!entry;
		if (!exists) create_dma_block(entry);

		if (block == first_block)
		{
			block_head = entry->head();

			if (exists)
			if (entry && entry->end() < limit)
			{
				if (entry->end() < limit)
				{
					auto new_length = block_end - block_head->start();
					block_head->extend(cmd, *g_render_device, new_length);
				}
				// Then the references to this object do not go to the end of the list as will be done with this new allocation.
				// A dumb release is therefore safe...
				entry.reset();
			}
			else

			if (!entry)
			{
				auto required_size = (block_end - block);
				block_head->init(*g_render_device, block, required_size);
				create_dma_block(entry, block, required_size);
			}

			block_head = entry->head();
		}
		else if (entry)
		{
			entry->set_parent(cmd, block_head);
		}
		else
		{
			if (exists)
			{
				entry->set_parent(cmd, block_head);
			}
			else
			{
				entry->init(block_head, block, s_dma_block_length);
			}
			entry.reset(new dma_block());
			entry->init(block_head, block, s_dma_block_length);
		}
	}
@@ -13,13 +13,6 @@ namespace vk
	class dma_block
	{
	protected:
		enum page_bits
		{
			synchronized = 0,
			dirty = 1,
			nocache = 3
		};

		struct
		{
			dma_block* parent = nullptr;
@@ -29,19 +22,17 @@ namespace vk

		u32 base_address = 0;
		std::unique_ptr<buffer> allocated_memory;
		std::vector<u64> page_info;

		virtual void allocate(const render_device& dev, usz size);
		virtual void free();
		virtual void* map_range(const utils::address_range& range);
		virtual void unmap();

		void set_page_bit(u32 page, u64 bits);
		bool test_page_bit(u32 page, u64 bits);
		void mark_dirty(const utils::address_range& range);
		void set_page_info(u32 page_offset, const std::vector<u64>& bits);

	public:

		dma_block() = default;
		virtual ~dma_block();

		virtual void init(const render_device& dev, u32 addr, usz size);
		virtual void init(dma_block* parent, u32 addr, usz size);
		virtual void flush(const utils::address_range& range);
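The virtual allocate/free/map_range/unmap hooks above are what let the two caching types coexist in the same pool. Purely as an assumed outline (the real dma_block_EXT declaration is not part of this diff), the pass-through variant overrides those hooks so that no staging copy exists:

// Assumed shape only. dma_block keeps its own host-coherent staging buffer (write-back),
// while dma_block_EXT is taken to import guest memory via VK_EXT_external_memory_host
// (pass-through), so mapping and write-back steps reduce to trivial operations.
class dma_block_EXT : public dma_block
{
protected:
	void allocate(const render_device& dev, usz size) override;  // imports the host range instead of allocating
	void* map_range(const utils::address_range& range) override; // returns a pointer straight into guest memory
	void unmap() override {}                                      // nothing was staged

public:
	void flush(const utils::address_range& range) override {}    // data is already visible to both sides
};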
@@ -70,7 +70,6 @@ namespace vk
	vk::clear_resolve_helpers();
	vk::clear_dma_resources();
	vk::vmm_reset();
	vk::get_resource_manager()->destroy();
	vk::clear_scratch_resources();

	vk::get_upload_heap()->destroy();
@@ -86,6 +85,9 @@ namespace vk
		p.second->destroy();
	}
	g_overlay_passes.clear();

	// This must be the last item destroyed
	vk::get_resource_manager()->destroy();
}

const vk::render_device *get_current_renderer()
@@ -263,8 +265,6 @@ namespace vk
	return (g_num_processed_frames > 0)? g_num_processed_frames - 1: 0;
}


void do_query_cleanup(vk::command_buffer& cmd)
{
	auto renderer = dynamic_cast<VKGSRender*>(rsx::get_current_renderer());
@@ -905,6 +905,8 @@ namespace vk
	}

	auto dma_mapping = vk::map_dma(cmd, static_cast<u32>(src_address), static_cast<u32>(data_length));

	ensure(dma_mapping.second->size() >= (dma_mapping.first + data_length));
	vk::load_dma(::narrow<u32>(src_address), data_length);

	upload_buffer = dma_mapping.second;
@@ -927,7 +929,7 @@ namespace vk
	}

	// Copy from upload heap to scratch mem
	if (!opt.deferred_cmds.empty())
	if (opt.require_upload)
	{
		for (const auto& copy_cmd : opt.deferred_cmds)
		{
@@ -953,7 +955,8 @@ namespace vk
			scratch_offset += image_linear_size;
			ensure((scratch_offset + image_linear_size) <= scratch_buf->size()); // "Out of scratch memory"
		}
		else if (opt.require_upload)

		if (opt.require_upload)
		{
			if (upload_commands.empty() || upload_buffer->value != upload_commands.back().first)
			{
@@ -974,7 +977,19 @@ namespace vk
	{
		ensure(scratch_buf);

		vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
		if (upload_commands.size() > 1)
		{
			auto range_ptr = buffer_copies.data();
			for (const auto& op : upload_commands)
			{
				vkCmdCopyBuffer(cmd, op.first, scratch_buf->value, op.second, range_ptr);
				range_ptr += op.second;
			}
		}
		else
		{
			vkCmdCopyBuffer(cmd, upload_buffer->value, scratch_buf->value, static_cast<u32>(buffer_copies.size()), buffer_copies.data());
		}

		insert_buffer_memory_barrier(cmd, scratch_buf->value, 0, scratch_offset, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
			VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
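The new branch above fans a single VkBufferCopy array out over several source buffers: upload_commands records, in order, each source VkBuffer together with how many consecutive regions of the shared array belong to it. A standalone sketch of that dispatch pattern (function and parameter names are illustrative):

#include <cstdint>
#include <utility>
#include <vector>
#include <vulkan/vulkan.h>

// Issue one vkCmdCopyBuffer per source buffer, walking a shared, contiguous region array.
void flush_batched_uploads(VkCommandBuffer cmd, VkBuffer scratch,
	const std::vector<VkBufferCopy>& regions,
	const std::vector<std::pair<VkBuffer, uint32_t>>& upload_commands)
{
	const VkBufferCopy* range_ptr = regions.data();
	for (const auto& op : upload_commands)
	{
		// op.first = source buffer, op.second = number of consecutive regions it owns
		vkCmdCopyBuffer(cmd, op.first, scratch, op.second, range_ptr);
		range_ptr += op.second;
	}
}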
@@ -1020,7 +1035,7 @@ namespace vk

		vkCmdCopyBufferToImage(cmd, scratch_buf->value, dst_image->value, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, static_cast<u32>(copy_regions.size()), copy_regions.data());
	}
	else if (opt.require_upload)
	else if (upload_commands.size() > 1)
	{
		auto region_ptr = copy_regions.data();
		for (const auto& op : upload_commands)