gl: Finalize host labels implementation

kd-11 2024-10-22 03:41:36 +03:00 committed by kd-11
parent 0db06964dc
commit 681debd8f6
14 changed files with 166 additions and 65 deletions

View File

@@ -300,7 +300,7 @@ namespace gl
m_src = fmt::replace_all(m_src, syntax_replace);
-param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+param_buffer.create(gl::buffer::target::uniform, 32, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
}
~cs_deswizzle_3d()

View File

@@ -19,8 +19,11 @@ namespace gl
void* userptr = vm::get_super_ptr(base_address);
m_data = std::make_unique<gl::buffer>();
-m_data->create(buffer::target::userptr, block_size, userptr);
+m_data->create(buffer::target::array, block_size, userptr, buffer::memory_type::userptr, 0);
m_base_address = base_address;
+// Some drivers may reject userptr input for whatever reason. Check that the state is still valid.
+gl::check_state();
}
void* dma_block::map(const utils::address_range& range) const
@@ -69,8 +72,8 @@ namespace gl
utils::address_range to_dma_block_range(u32 start, u32 length)
{
-const auto start_block_address = start & ~s_dma_block_size;
-const auto end_block_address = (start + length - 1) & ~s_dma_block_size;
+const auto start_block_address = start & -s_dma_block_size;
+const auto end_block_address = (start + length + s_dma_block_size - 1) & -s_dma_block_size;
return utils::address_range::start_end(start_block_address, end_block_address);
}
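Note on the fix above: with a power-of-two s_dma_block_size, `addr & -s_dma_block_size` clears all the low bits and rounds down to a block boundary, while the old `addr & ~s_dma_block_size` cleared only the single block-size bit and left most addresses unaligned. A minimal standalone sketch, using a hypothetical block size for illustration:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical power-of-two block size; the real s_dma_block_size lives in GLDMA.cpp.
constexpr uint32_t block_size = 0x1000000;

int main()
{
    const uint32_t start = 0x12345678, length = 0x100;

    // -block_size == ~(block_size - 1): clears the low 24 bits, rounding down.
    const uint32_t lo = start & -block_size;
    // Round the end of the range up to the next block boundary.
    const uint32_t hi = (start + length + block_size - 1) & -block_size;
    assert(lo == 0x12000000 && hi == 0x13000000);

    // The old mask only cleared bit 24 (which is already 0 here), so the
    // "aligned" address came back completely unaligned.
    assert((start & ~block_size) == start);
}
```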
@@ -81,7 +84,7 @@ namespace gl
if (!block)
{
block = std::make_unique<dma_block>();
-block->allocate(block_range.start, length);
+block->allocate(block_range.start, block_range.length());
return *block;
}
@@ -96,6 +99,7 @@ namespace gl
const auto search_end = (block_range.end + 1);
// 1. Resize to new length
+ensure((new_length & -s_dma_block_size) == new_length);
auto new_owner = std::make_unique<dma_block>();
new_owner->allocate(owner->base_addr(), new_length);

View File

@@ -24,7 +24,7 @@ namespace gl
void* map(const utils::address_range& range) const;
void set_parent(const dma_block* other);
-const dma_block* head() const { return m_parent; }
+const dma_block* head() const { return m_parent ? m_parent : this; }
bool can_map(const utils::address_range& range) const;
u32 base_addr() const { return m_base_address; }

View File

@@ -181,18 +181,18 @@ void GLGSRender::on_init_thread()
backend_config.supports_normalized_barycentrics = false;
}
-if (gl_caps.AMD_pinned_memory)
+if (gl_caps.AMD_pinned_memory && g_cfg.video.host_label_synchronization)
{
backend_config.supports_host_gpu_labels = true;
-if (g_cfg.video.host_label_synchronization)
-{
-m_host_gpu_context_data = std::make_unique<gl::buffer>();
-m_host_gpu_context_data->create(gl::buffer::target::array, 4096);
+m_host_gpu_context_data = std::make_unique<gl::buffer>();
+m_host_gpu_context_data->create(gl::buffer::target::array, 4096, nullptr, gl::buffer::memory_type::host_visible,
+gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
-auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::read));
-m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
-}
+auto host_context_ptr = reinterpret_cast<rsx::host_gpu_context_t*>(m_host_gpu_context_data->map(0, 4096, gl::buffer::access::persistent_rw));
+m_host_dma_ctrl = std::make_unique<rsx::RSXDMAWriter>(host_context_ptr);
+m_enqueued_host_write_buffer = std::make_unique<gl::scratch_ring_buffer>();
+m_enqueued_host_write_buffer->create(gl::buffer::target::array, 64 * 0x100000, gl::buffer::usage::dynamic_update);
}
// Use industry standard resource alignment values as defaults
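The host GPU context buffer is now created as persistently mapped storage so the CPU can poll labels while the GPU writes them. A sketch of the raw GL calls the new memory_type/usage/access combination corresponds to (this is the standard ARB_buffer_storage pattern, not the backend's literal code path):

```cpp
// Sketch (GL 4.5 DSA): what host_visible + host_read/host_write/persistent_map
// expands to under ARB_buffer_storage. GL_MAP_COHERENT_BIT is not requested,
// so visibility still relies on explicit synchronization.
GLuint id;
glCreateBuffers(1, &id);
glNamedBufferStorage(id, 4096, nullptr,
    GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);

// access::persistent_rw: the returned pointer stays valid while the buffer lives.
void* host_ptr = glMapNamedBufferRange(id, 0, 4096,
    GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
```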
@@ -425,6 +425,7 @@ void GLGSRender::on_exit()
m_host_dma_ctrl.reset();
m_host_gpu_context_data.reset();
+m_enqueued_host_write_buffer.reset();
for (auto &fbo : m_framebuffer_cache)
{
@@ -1222,6 +1223,66 @@ void GLGSRender::notify_tile_unbound(u32 tile)
}
}
+bool GLGSRender::release_GCM_label(u32 address, u32 args)
+{
+if (!backend_config.supports_host_gpu_labels)
+{
+return false;
+}
+auto host_ctx = ensure(m_host_dma_ctrl->host_ctx());
+if (host_ctx->texture_loads_completed())
+{
+// We're about to poll waiting for GPU state, ensure the context is still valid.
+gl::check_state();
+// All texture loads already seen by the host GPU
+// Wait for all previously submitted labels to be flushed
+m_host_dma_ctrl->drain_label_queue();
+return false;
+}
+const auto mapping = gl::map_dma(address, 4);
+const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
+const auto release_event_id = host_ctx->on_label_acquire();
+// We don't have async texture loads yet, so just release both the label and the commands complete
+u64 write_buf[2] = { write_data, release_event_id };
+const auto host_read_offset = m_enqueued_host_write_buffer->alloc(16, 16);
+m_enqueued_host_write_buffer->get().sub_data(host_read_offset, 16, write_buf);
+// Now write to DMA and then to host context
+m_enqueued_host_write_buffer->get().copy_to(mapping.second, host_read_offset, mapping.first, 4);
+m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset + 8, ::offset32(&rsx::host_gpu_context_t::commands_complete_event), 8);
+m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+host_ctx->on_label_release();
+return true;
+}
+void GLGSRender::enqueue_host_context_write(u32 offset, u32 size, const void* data)
+{
+ensure(size <= 8);
+const u32 host_read_offset = m_enqueued_host_write_buffer->alloc(8, 16);
+m_enqueued_host_write_buffer->get().sub_data(host_read_offset, size, data);
+m_enqueued_host_write_buffer->get().copy_to(m_host_gpu_context_data.get(), host_read_offset, offset, size);
+m_enqueued_host_write_buffer->push_barrier(host_read_offset, 16);
+}
+void GLGSRender::on_guest_texture_read()
+{
+if (!backend_config.supports_host_gpu_labels)
+{
+return;
+}
+// Tag the read as being in progress
+u64 event_id = m_host_dma_ctrl->host_ctx()->inc_counter();
+m_host_dma_ctrl->host_ctx()->texture_load_request_event = event_id;
+enqueue_host_context_write(::offset32(&rsx::host_gpu_context_t::texture_load_complete_event), 8, &event_id);
+}
void GLGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
{
query->result = 0;
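The release path above never writes the label from the CPU: the value and a fresh event id are staged in the scratch ring buffer and copied out by the GPU in submission order, so the label only becomes visible after all previously submitted commands have executed. A hedged sketch of the consumer side (field names mirror rsx::host_gpu_context_t; the actual waiter is RSXDMAWriter::drain_label_queue(), called above):

```cpp
// Sketch: how a waiter decides that a label released via release_GCM_label()
// is final. Event ids grow monotonically; the GPU copies release_event_id
// into commands_complete_event only after every prior command has executed.
bool label_is_final(const volatile rsx::host_gpu_context_t* ctx, u64 release_event_id)
{
    return ctx->commands_complete_event >= release_event_id;
}
```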

View File

@@ -152,6 +152,7 @@ class GLGSRender : public GSRender, public ::rsx::reports::ZCULL_control
// Host context for GPU-driven work
std::unique_ptr<gl::buffer> m_host_gpu_context_data;
+std::unique_ptr<gl::scratch_ring_buffer> m_enqueued_host_write_buffer;
public:
u64 get_cycles() final;
@@ -196,6 +197,11 @@ public:
void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
+// DMA
+bool release_GCM_label(u32 address, u32 data) override;
+void enqueue_host_context_write(u32 offset, u32 size, const void* data);
+void on_guest_texture_read();
// GRAPH backend
void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;

View File

@@ -3,6 +3,7 @@
#include "GLCompute.h"
#include "GLRenderTargets.h"
#include "GLOverlays.h"
+#include "GLGSRender.h"
#include "glutils/blitter.h"
#include "glutils/ring_buffer.h"
@@ -285,7 +286,7 @@ namespace gl
if (!(*dst) || max_mem > static_cast<u64>(dst->size()))
{
if (*dst) dst->remove();
-dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+dst->create(buffer::target::ssbo, max_mem, nullptr, buffer::memory_type::local, 0);
}
if (auto as_vi = dynamic_cast<const gl::viewable_image*>(src);
@@ -400,7 +401,7 @@ namespace gl
return;
}
-scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, GL_STATIC_COPY);
+scratch_mem.create(buffer::target::pixel_pack, max_mem, nullptr, buffer::memory_type::local, 0);
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
src->copy_to(&scratch_mem, in_offset, 0, mem_info->image_size_in_bytes);
@@ -835,6 +836,10 @@ namespace gl
const GLenum gl_format = std::get<0>(format_type);
const GLenum gl_type = std::get<1>(format_type);
fill_texture(cmd, dst, gcm_format, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf);
+// Notify the renderer of the upload
+auto renderer = static_cast<GLGSRender*>(rsx::get_current_renderer());
+renderer->on_guest_texture_read();
}
u32 get_format_texel_width(GLenum format)
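Texture uploads now join the same event handshake: the CPU stamps a request id immediately, and a matching completion id is enqueued for the GPU to write once the upload commands have executed. A condensed sketch of the invariant this maintains (not the emulator's actual struct definition):

```cpp
using u64 = unsigned long long;

// Sketch of the request/complete pairing used by on_guest_texture_read().
struct host_gpu_context_sketch
{
    volatile u64 texture_load_request_event = 0;  // stamped by the CPU per upload
    volatile u64 texture_load_complete_event = 0; // written by the GPU, in order

    bool texture_loads_completed() const volatile
    {
        // Every upload the CPU has issued has also been observed by the GPU.
        return texture_load_complete_event >= texture_load_request_event;
    }
};
```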

View File

@@ -59,7 +59,7 @@ namespace gl
pbo.remove();
}
-pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, GL_STREAM_READ);
+pbo.create(buffer::target::pixel_pack, buffer_size, nullptr, buffer::memory_type::host_visible, buffer::usage::host_read);
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
}

View File

@@ -3,38 +3,35 @@
namespace gl
{
-void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_flags)
{
+m_memory_type = type;
if (const auto& caps = get_driver_caps();
-m_target != target::userptr && caps.ARB_buffer_storage_supported)
+type != memory_type::userptr && caps.ARB_buffer_storage_supported)
{
GLenum flags = 0;
-if (type == memory_type::host_visible)
+if (usage_flags & usage::host_write)
{
-switch (usage)
-{
-case GL_STREAM_DRAW:
-case GL_STATIC_DRAW:
-case GL_DYNAMIC_DRAW:
-flags |= GL_MAP_WRITE_BIT;
-break;
-case GL_STREAM_READ:
-case GL_STATIC_READ:
-case GL_DYNAMIC_READ:
-flags |= GL_MAP_READ_BIT;
-break;
-default:
-fmt::throw_exception("Unsupported buffer usage 0x%x", usage);
-}
+flags |= GL_MAP_WRITE_BIT;
}
-else
+if (usage_flags & usage::host_read)
{
-// Local memory hints
-if (usage == GL_DYNAMIC_COPY)
-{
-flags |= GL_DYNAMIC_STORAGE_BIT;
-}
+flags |= GL_MAP_READ_BIT;
}
+if (usage_flags & usage::persistent_map)
+{
+flags |= GL_MAP_PERSISTENT_BIT;
+}
+if (usage_flags & usage::dynamic_update)
+{
+flags |= GL_DYNAMIC_STORAGE_BIT;
+}
+ensure((flags & (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT)) != (GL_MAP_PERSISTENT_BIT | GL_DYNAMIC_STORAGE_BIT),
+"Mutually exclusive usage flags set!");
+ensure(type == memory_type::local || flags != 0, "Host-visible memory must have usage flags set!");
if ((flags & GL_MAP_READ_BIT) && !caps.vendor_AMD)
{
@@ -51,10 +48,8 @@ namespace gl
}
else
{
-data(size, data_, usage);
+data(size, data_, GL_STREAM_COPY);
}
-m_memory_type = type;
}
buffer::~buffer()
@@ -89,18 +84,18 @@ namespace gl
save_binding_state save(current_target(), *this);
}
-void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::create(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
create();
-allocate(size, data_, type, usage);
+allocate(size, data_, type, usage_bits);
}
-void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLenum usage)
+void buffer::create(target target_, GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits)
{
m_target = target_;
create();
-allocate(size, data_, type, usage);
+allocate(size, data_, type, usage_bits);
}
void buffer::remove()
@@ -117,11 +112,19 @@ namespace gl
{
ensure(m_memory_type != memory_type::local);
-DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
m_size = size;
+if (m_memory_type == memory_type::userptr)
+{
+glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, m_id);
+glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, size, data_, usage);
+return;
+}
+DSA_CALL2(NamedBufferData, m_id, size, data_, usage);
}
-void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data)
+void buffer::sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data)
{
ensure(m_memory_type == memory_type::local);
DSA_CALL2(NamedBufferSubData, m_id, offset, length, data);
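Moving userptr from buffer::target to memory_type matches how AMD_pinned_memory actually works: GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD is only meaningful as the binding point of the data upload, after which the buffer behaves like any other buffer object. A sketch of the raw extension usage (the client memory must be page-aligned and must outlive the buffer):

```cpp
// Sketch: raw AMD_pinned_memory usage. 'backing' is a hypothetical
// page-aligned allocation that the driver adopts as the data store.
void* backing = /* page-aligned allocation of block_size bytes */ nullptr;
GLsizeiptr block_size = 0x10000;

GLuint id;
glGenBuffers(1, &id);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, id);
glBufferData(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, block_size, backing, GL_STREAM_COPY);
glBindBuffer(GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD, 0);
// The buffer can now be bound to ordinary targets (copy source, array, ...).
```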

View File

@@ -15,28 +15,37 @@ namespace gl
element_array = GL_ELEMENT_ARRAY_BUFFER,
uniform = GL_UNIFORM_BUFFER,
texture = GL_TEXTURE_BUFFER,
-ssbo = GL_SHADER_STORAGE_BUFFER,
-userptr = GL_EXTERNAL_VIRTUAL_MEMORY_BUFFER_AMD
+ssbo = GL_SHADER_STORAGE_BUFFER
};
enum class access
{
read = GL_MAP_READ_BIT,
write = GL_MAP_WRITE_BIT,
-read_write = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT
+rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
+persistent_rw = GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT
};
enum class memory_type
{
undefined = 0,
local = 1,
-host_visible = 2
+host_visible = 2,
+userptr = 4
};
+enum usage
+{
+host_write = (1 << 0),
+host_read = (1 << 1),
+persistent_map = (1 << 2),
+dynamic_update = (1 << 3),
+};
class save_binding_state
{
-GLint m_last_binding;
-GLenum m_target;
+GLint m_last_binding = GL_ZERO;
+GLenum m_target = GL_NONE;
public:
save_binding_state(target target_, const buffer& new_state) : save_binding_state(target_)
@@ -65,6 +74,11 @@ namespace gl
~save_binding_state()
{
+if (!m_target)
+{
+return;
+}
glBindBuffer(m_target, m_last_binding);
}
};
@@ -78,7 +92,7 @@ namespace gl
// Metadata
mutable std::pair<u32, u32> m_bound_range{};
-void allocate(GLsizeiptr size, const void* data_, memory_type type, GLenum usage);
+void allocate(GLsizeiptr size, const void* data_, memory_type type, GLuint usage_bits);
public:
buffer() = default;
@@ -89,8 +103,8 @@ namespace gl
void recreate(GLsizeiptr size, const void* data = nullptr);
void create();
-void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
-void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLenum usage = GL_STREAM_DRAW);
+void create(GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
+void create(target target_, GLsizeiptr size, const void* data_ = nullptr, memory_type type = memory_type::local, GLuint usage_bits = 0);
void remove();
@@ -98,7 +112,7 @@ namespace gl
void bind() const { bind(current_target()); }
void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW);
-void sub_data(GLsizeiptr offset, GLsizeiptr length, GLvoid* data);
+void sub_data(GLsizeiptr offset, GLsizeiptr length, const GLvoid* data);
GLubyte* map(GLsizeiptr offset, GLsizeiptr length, access access_);
void unmap();
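With these enums, call sites state intent instead of passing raw GL usage hints. Two representative uses of the new create() overloads, mirroring call sites elsewhere in this commit:

```cpp
// Device-local buffer that the CPU may still update in place via sub_data():
gl::buffer ubo;
ubo.create(gl::buffer::target::uniform, 4096, nullptr,
    gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);

// Host-visible buffer kept persistently mapped for CPU polling:
gl::buffer labels;
labels.create(gl::buffer::target::array, 4096, nullptr,
    gl::buffer::memory_type::host_visible,
    gl::buffer::usage::host_read | gl::buffer::usage::host_write | gl::buffer::usage::persistent_map);
auto* ptr = labels.map(0, 4096, gl::buffer::access::persistent_rw);
```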

View File

@@ -79,4 +79,12 @@ namespace gl
{
glInsertEventMarkerEXT(static_cast<GLsizei>(strlen(label)), label);
}
+// Checks if GL state is still valid
+void check_state()
+{
+// GL_OUT_OF_MEMORY invalidates the OpenGL context and is actually the GL version of DEVICE_LOST.
+// This spec workaround allows it to be abused by ISVs to indicate a broken GL context.
+ensure(glGetError() != GL_OUT_OF_MEMORY);
+}
}
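Because the only error of interest here is the context-loss signal, check_state() is cheap enough to drop into spin-wait paths. A sketch of the intended pattern (the label pointer is hypothetical):

```cpp
#include <thread>
using u64 = unsigned long long;

// Sketch: guard a CPU-side spin on a persistently mapped label against a
// dead context. 'label' is a hypothetical pointer into the mapped range.
void wait_for_label(const volatile u64* label, u64 expected)
{
    while (*label < expected)
    {
        gl::check_state(); // raises via ensure() if the driver reported GL_OUT_OF_MEMORY
        std::this_thread::yield();
    }
}
```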

View File

@@ -242,14 +242,14 @@ namespace gl
}
}
-void scratch_ring_buffer::create(buffer::target target_, u64 size)
+void scratch_ring_buffer::create(buffer::target target_, u64 size, u32 usage_flags)
{
if (m_storage)
{
remove();
}
-m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, GL_STATIC_COPY);
+m_storage.create(target_, size, nullptr, gl::buffer::memory_type::local, usage_flags);
}
void scratch_ring_buffer::remove()

View File

@@ -103,7 +103,7 @@ namespace gl
scratch_ring_buffer(const scratch_ring_buffer&) = delete;
~scratch_ring_buffer();
-void create(buffer::target _target, u64 size);
+void create(buffer::target _target, u64 size, u32 usage_flags = 0);
void remove();
u32 alloc(u32 size, u32 alignment);

View File

@@ -80,7 +80,7 @@ namespace gl
if (!m_ubo)
{
ensure(compiled);
-m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, GL_DYNAMIC_COPY);
+m_ubo.create(gl::buffer::target::uniform, push_buffer_size, nullptr, gl::buffer::memory_type::local, gl::buffer::usage::dynamic_update);
// Statically bind the image sources
m_program.uniforms["InputTexture"] = GL_TEMP_IMAGE_SLOT(0);

View File

@@ -27,7 +27,7 @@ namespace rsx
inline bool in_flight_commands_completed() const volatile
{
-return last_label_release2_event == commands_complete_event;
+return last_label_release2_event <= commands_complete_event;
}
inline bool texture_loads_completed() const volatile
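The relaxation from == to <= matters because event ids appear to come from a counter shared with other host events (see inc_counter() in the texture-read path above), so the completion value observed can already be newer than the last recorded label release; strict equality would then never be satisfied. A tiny worked example with illustrative values:

```cpp
#include <cassert>
using u64 = unsigned long long;

int main()
{
    // Illustrative: the GPU has already executed a newer event (id 8) than
    // the last label release this thread recorded (id 5).
    const u64 last_label_release2_event = 5;
    const u64 commands_complete_event = 8;

    assert(last_label_release2_event != commands_complete_event); // '==' would spin forever
    assert(last_label_release2_event <= commands_complete_event); // '<=' correctly reports done
}
```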