rsx/vk: Initial hw-accelerated tile decoder

This commit is contained in:
kd-11 2023-09-12 02:41:25 +03:00 committed by kd-11
parent 6a7386ddb8
commit 647f7ddeec
4 changed files with 559 additions and 9 deletions

View File

@ -0,0 +1,349 @@
R"(
#version 450
// Compute shader that copies texels between a linear image buffer and RSX tiled memory.
// The host replaces the percent-prefixed placeholders in this source before it is compiled.
layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
// Binding index helper: shifts the logical SSBO slot by a host-supplied base.
#define SSBO_LOCATION(x) (x + %loc)
// Transfer direction, fixed at build time.
// MEMORY_OP_DETILE: tiled_data is read, linear_data is written.
// MEMORY_OP_TILE:   linear_data is read, tiled_data is written.
#define MEMORY_OP %op
#define MEMORY_OP_DETILE 0
#define MEMORY_OP_TILE 1
// The buffer that is only read for the selected direction is marked readonly.
#if (MEMORY_OP == MEMORY_OP_TILE)
#define TILED_DATA_MODIFIER
#define LINEAR_DATA_MODIFIER readonly
#else
#define TILED_DATA_MODIFIER readonly
#define LINEAR_DATA_MODIFIER
#endif
// Tiled-layout image data, addressed as 32-bit words.
layout(%set, binding=SSBO_LOCATION(0), std430) TILED_DATA_MODIFIER restrict buffer TiledDataBlock
{
uint tiled_data[];
};
// Linear-layout image data, addressed as 32-bit words.
layout(%set, binding=SSBO_LOCATION(1), std430) LINEAR_DATA_MODIFIER restrict buffer LinearDataBlock
{
uint linear_data[];
};
// Job configuration. Both variants below declare the same fields in the same order:
//   prime, factor      - decomposition of the tile pitch supplied by the host; only 'factor' is
//                        referenced by the functions in this shader (bank selection)
//   num_tiles_per_row  - divisor used to split a tile-line index into tile column and line
//   tile_base_address  - base address of the tiled region
//   tile_size          - upper bound (in bytes) for offsets into tiled_data
//   tile_offset        - byte offset of the first transferred row from tile_base_address
//   tile_pitch         - byte stride between consecutive rows
//   tile_bank          - value added to the computed bank selector
//   image_width/height - image extents in texels, used for the bounds check in main()
//   image_bpp          - bytes per texel (16, 8, 4, 2 or 1 are handled)
#ifdef VULKAN
layout(%push_block) uniform Configuration
{
uint prime;
uint factor;
uint num_tiles_per_row;
uint tile_base_address;
uint tile_size;
uint tile_offset;
uint tile_pitch;
uint tile_bank;
uint image_width;
uint image_height;
uint image_bpp;
};
#else
uniform uint prime;
uniform uint factor;
uniform uint num_tiles_per_row;
uniform uint tile_base_address;
uniform uint tile_size;
uniform uint tile_offset;
uniform uint tile_pitch;
uniform uint tile_bank;
uniform uint image_width;
uniform uint image_height;
uniform uint image_bpp;
#endif
// Constants
// Bytes covered by one line of a tile (byte offsets are divided by this value).
#define RSX_TILE_WIDTH 256
// Number of lines in one tile.
#define RSX_TILE_HEIGHT 64
#if (MEMORY_OP == MEMORY_OP_TILE)
// Fetches the texel at texel index 'offset' from linear_data.
// Texels of 4 bytes or more occupy whole 32-bit words; 1- and 2-byte texels are
// unpacked from the word that contains them. Unused components are zero, and an
// unsupported image_bpp yields an all-zero vector.
uvec4 read_linear(const in uint offset)
{
	if (image_bpp == 16)
	{
		const uint base = offset * 4;
		return uvec4(
			linear_data[base],
			linear_data[base + 1],
			linear_data[base + 2],
			linear_data[base + 3]);
	}

	if (image_bpp == 8)
	{
		const uint base = offset * 2;
		return uvec4(linear_data[base], linear_data[base + 1], 0, 0);
	}

	if (image_bpp == 4)
	{
		return uvec4(linear_data[offset], 0, 0, 0);
	}

	if (image_bpp == 2)
	{
		// Two texels per word: pick the low or high half.
		const uint dword = linear_data[offset >> 1];
		const int bit_offset = int(offset & 1) << 4;
		return uvec4(bitfieldExtract(dword, bit_offset, 16), 0, 0, 0);
	}

	if (image_bpp == 1)
	{
		// Four texels per word: pick one byte.
		const uint dword = linear_data[offset >> 2];
		const int bit_offset = int(offset & 3) << 3;
		return uvec4(bitfieldExtract(dword, bit_offset, 8), 0, 0, 0);
	}

	return uvec4(0);
}
// Stores one texel into tiled_data at texel index 'offset'.
// Only the components needed for image_bpp bytes are written; an unsupported
// image_bpp writes nothing.
void write_tiled(const in uint offset, const in uvec4 value)
{
	switch (image_bpp)
	{
	case 16:
	{
		tiled_data[offset * 4] = value.x;
		tiled_data[offset * 4 + 1] = value.y;
		tiled_data[offset * 4 + 2] = value.z;
		tiled_data[offset * 4 + 3] = value.w;
		break;
	}
	case 8:
	{
		tiled_data[offset * 2] = value.x;
		tiled_data[offset * 2 + 1] = value.y;
		break;
	}
	case 4:
	{
		tiled_data[offset] = value.x;
		break;
	}
	case 2:
	{
		// Two texels share each 32-bit word and are written by different invocations.
		// A plain load/modify/store could overwrite the neighbour's half with stale data,
		// so clear and set only the bits owned by this texel using atomics.
		const uint word_offset = offset >> 1;
		const uint shift = (offset & 1u) << 4;
		atomicAnd(tiled_data[word_offset], ~(0xFFFFu << shift));
		atomicOr(tiled_data[word_offset], (value.x & 0xFFFFu) << shift);
		break;
	}
	case 1:
	{
		// Four texels share each 32-bit word; same reasoning as the 2-byte case.
		const uint word_offset = offset >> 2;
		const uint shift = (offset & 3u) << 3;
		atomicAnd(tiled_data[word_offset], ~(0xFFu << shift));
		atomicOr(tiled_data[word_offset], (value.x & 0xFFu) << shift);
		break;
	}
	default:
		break;
	}
}
#else
// Fetches the texel at texel index 'offset' from tiled_data.
// Texels of 4 bytes or more occupy whole 32-bit words; 1- and 2-byte texels are
// unpacked from the word that contains them. Unused components are zero, and an
// unsupported image_bpp yields an all-zero vector.
uvec4 read_tiled(const in uint offset)
{
	if (image_bpp == 16)
	{
		const uint base = offset * 4;
		return uvec4(
			tiled_data[base],
			tiled_data[base + 1],
			tiled_data[base + 2],
			tiled_data[base + 3]);
	}

	if (image_bpp == 8)
	{
		const uint base = offset * 2;
		return uvec4(tiled_data[base], tiled_data[base + 1], 0, 0);
	}

	if (image_bpp == 4)
	{
		return uvec4(tiled_data[offset], 0, 0, 0);
	}

	if (image_bpp == 2)
	{
		// Two texels per word: pick the low or high half.
		const uint dword = tiled_data[offset >> 1];
		const int bit_offset = int(offset & 1) << 4;
		return uvec4(bitfieldExtract(dword, bit_offset, 16), 0, 0, 0);
	}

	if (image_bpp == 1)
	{
		// Four texels per word: pick one byte.
		const uint dword = tiled_data[offset >> 2];
		const int bit_offset = int(offset & 3) << 3;
		return uvec4(bitfieldExtract(dword, bit_offset, 8), 0, 0, 0);
	}

	return uvec4(0);
}
// Stores one texel into linear_data at texel index 'offset'.
// Only the components needed for image_bpp bytes are written; an unsupported
// image_bpp writes nothing.
void write_linear(const in uint offset, const in uvec4 value)
{
	switch (image_bpp)
	{
	case 16:
	{
		linear_data[offset * 4] = value.x;
		linear_data[offset * 4 + 1] = value.y;
		linear_data[offset * 4 + 2] = value.z;
		linear_data[offset * 4 + 3] = value.w;
		break;
	}
	case 8:
	{
		linear_data[offset * 2] = value.x;
		linear_data[offset * 2 + 1] = value.y;
		break;
	}
	case 4:
	{
		linear_data[offset] = value.x;
		break;
	}
	case 2:
	{
		// Two texels share each 32-bit word and are written by different invocations.
		// A plain load/modify/store could overwrite the neighbour's half with stale data,
		// so clear and set only the bits owned by this texel using atomics.
		const uint word_offset = offset >> 1;
		const uint shift = (offset & 1u) << 4;
		atomicAnd(linear_data[word_offset], ~(0xFFFFu << shift));
		atomicOr(linear_data[word_offset], (value.x & 0xFFFFu) << shift);
		break;
	}
	case 1:
	{
		// Four texels share each 32-bit word; same reasoning as the 2-byte case.
		const uint word_offset = offset >> 2;
		const uint shift = (offset & 3u) << 3;
		atomicAnd(linear_data[word_offset], ~(0xFFu << shift));
		atomicOr(linear_data[word_offset], (value.x & 0xFFu) << shift);
		break;
	}
	default:
		break;
	}
}
#endif
// Transfers the single texel at (row, col) between the linear and the tiled buffer.
// The byte address of the texel is translated into a tiled address built from a row address,
// a bank selector, a column selector and a partition bit; the texel is then copied in the
// direction chosen by MEMORY_OP. Texels whose tiled offset falls outside tile_size are skipped.
// NOTE(review): num_tiles_per_row and image_bpp are used as divisors here; the host must not
// dispatch with either set to zero.
void do_memory_op(const in uint row, const in uint col)
{
// Byte address of the texel as if the surface were stored linearly with a stride of tile_pitch
const uint row_offset = (row * tile_pitch) + tile_base_address + tile_offset;
const uint this_address = row_offset + (col * image_bpp);
// 1. Calculate row_addr
// Index of the RSX_TILE_WIDTH-byte tile line that contains this address
const uint texel_offset = (this_address - tile_base_address) / RSX_TILE_WIDTH;
// Calculate coordinate of the tile grid we're supposed to be in
const uint tile_x = texel_offset % num_tiles_per_row;
const uint tile_y = (texel_offset / num_tiles_per_row) / RSX_TILE_HEIGHT;
// Calculate the grid offset for the tile selected and add the base offset. It's supposed to affect the bank stuff in the next step
const uint tile_id = tile_y * num_tiles_per_row + tile_x;
const uint tile_selector = (tile_id + (tile_base_address >> 14)) & 0x3ffff;
// Calculate row address
const uint row_address = (tile_selector >> 2) & 0xffff;
// 2. Calculate bank selector
// There's a lot of weird math here, but it's just a variant of (tile_selector % 4) to pick a value between [0..3]
// NOTE(review): a factor of 0 or 3 matches none of the branches below and leaves the selector at 0
// before tile_bank is added - confirm such factors cannot occur.
uint bank_selector = 0;
const uint bank_distribution_lookup[16] = { 0, 1, 2, 3, 2, 3, 0, 1, 1, 2, 3, 0, 3, 0, 1, 2 };
if (factor == 1)
{
bank_selector = (tile_selector & 3);
}
else if (factor == 2)
{
const uint idx = ((tile_selector + ((tile_y & 1) << 1)) & 3) * 4 + (tile_y & 3);
bank_selector = bank_distribution_lookup[idx];
}
else if (factor >= 4)
{
const uint idx = (tile_selector & 3) * 4 + (tile_y & 3);
bank_selector = bank_distribution_lookup[idx];
}
bank_selector = (bank_selector + tile_bank) % 4;
// 3. Calculate column selector
uint column_selector = 0;
// Line index inside the selected tile, in the range [0, RSX_TILE_HEIGHT)
const uint line_offset_in_tile = (texel_offset / num_tiles_per_row) % RSX_TILE_HEIGHT;
// Calculate column_selector by bit-twiddling line offset and the other calculated parameter bits:
// column_selector[9:7] = line_offset_in_tile[5:3]
// column_selector[6:4] = this_address[7:5]
// column_selector[3:2] = line_offset_in_tile[1:0]
// column_selector[1:0] = 0
column_selector |= ((line_offset_in_tile >> 3) & 0x7) << 7;
column_selector |= ((this_address >> 5) & 0x7) << 4;
column_selector |= ((line_offset_in_tile >> 0) & 0x3) << 2;
// 4. Calculate partition selector (0 or 1)
const uint partition_selector = (((line_offset_in_tile >> 2) & 1) + ((this_address >> 6) & 1)) & 1;
// 5. Build tiled address
uint tile_address = 0;
// tile_address[31:16] = row_adr[15:0]
// tile_address[15:14] = bank_sel[1:0]
// tile_address[13:8] = column_sel[9:4]
// tile_address[7:7] = partition_sel[0:0]
// tile_address[6:5] = column_sel[3:2]
// tile_address[4:0] = this_address[4:0]
tile_address |= ((row_address >> 0) & 0xFFFF) << 16;
tile_address |= ((bank_selector >> 0) & 0x3) << 14;
tile_address |= ((column_selector >> 4) & 0x3F) << 8;
tile_address |= ((partition_selector >> 0) & 0x1) << 7;
tile_address |= ((column_selector >> 2) & 0x3) << 5;
tile_address |= ((this_address >> 0) & 0x1F) << 0;
// Twiddle bits 9 and 10
// The second statement reads the already updated tile_address, so the order of these two lines matters
tile_address ^= (((tile_address >> 12) ^ ((bank_selector ^ tile_selector) & 1) ^ (tile_address >> 14)) & 1) << 9;
tile_address ^= ((tile_address >> 11) & 1) << 10;
// Calculate relative addresses and sample
// NOTE(review): the linear offset is computed with tile_pitch, while the host sizes the linear
// buffer from the image pitch - presumably correct only when both pitches are equal; confirm.
uint linear_image_offset = (row * tile_pitch) + (col * image_bpp);
// Unsigned arithmetic: an address below the region start wraps to a large value and is rejected below
uint tile_data_offset = tile_address - (tile_base_address + tile_offset);
if (tile_data_offset >= tile_size)
{
// Do not touch anything out of bounds
return;
}
// Convert to texel addresses for data access
linear_image_offset /= image_bpp;
tile_data_offset /= image_bpp;
#if (MEMORY_OP == MEMORY_OP_DETILE)
// Write to linear from tiled
write_linear(linear_image_offset, read_tiled(tile_data_offset));
#else
// Opposite. Write to tile from linear
write_tiled(tile_data_offset, read_linear(linear_image_offset));
#endif
}
// Entry point. Every invocation transfers exactly one texel; its 2D coordinates are
// taken from gl_GlobalInvocationID (x = column, y = row).
void main()
{
	const uint row = gl_GlobalInvocationID.y;
	const uint col = gl_GlobalInvocationID.x;

	if (row >= image_height || col >= image_width)
	{
		// Out of bounds
		return;
	}

	do_memory_op(row, col);
}
)"

View File

@ -502,6 +502,159 @@ namespace vk
void run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words);
};
// Direction of a tile memcpy job.
// The numeric values are significant: cs_tile_memcpy pastes them into the shader source as
// MEMORY_OP and XORs them into the SSBO binding indices.
enum RSX_detiler_op
{
// Tiled source -> linear destination
decode = 0,
// Linear source -> tiled destination
encode = 1
};
// Describes a single tiling/detiling job for cs_tile_memcpy::run.
struct RSX_detiler_config
{
// Base address of the tiled region
u32 tile_base_address;
// Offset of the transferred data from tile_base_address
u32 tile_base_offset;
// Size of the tiled region
u32 tile_size;
// Pitch of the tiled region
u32 tile_pitch;
// Bank value of the tiled region, forwarded to the shader as tile_bank
u32 bank;
// Destination buffer and the offset at which it is bound
const vk::buffer* dst;
u32 dst_offset;
// Source buffer and the offset at which it is bound
const vk::buffer* src;
u32 src_offset;
// Image extents; used as the dispatch size and for the shader's bounds check
u16 image_width;
u16 image_height;
// Pitch of the linear image; image_pitch / image_width is passed to the shader as image_bpp
u32 image_pitch;
};
// Compute job that copies an image between a linear buffer and RSX tiled memory.
// Op selects the direction: decode reads the tiled layout and writes linear data,
// encode does the opposite.
template <RSX_detiler_op Op>
struct cs_tile_memcpy : compute_task
{
	// Push-constant payload. Field order must match the Configuration block of the shader.
#pragma pack (push, 1)
	struct
	{
		u32 prime;
		u32 factor;
		u32 num_tiles_per_row;
		u32 tile_base_address;
		u32 tile_size;
		u32 tile_offset;
		u32 tile_pitch;
		u32 tile_bank;
		u32 image_width;
		u32 image_height;
		u32 image_bpp;
	} params;
#pragma pack (pop)

	// Buffers and bound ranges for the job in flight; filled in by run()
	const vk::buffer* src_buffer = nullptr;
	const vk::buffer* dst_buffer = nullptr;
	u32 in_offset = 0;
	u32 out_offset = 0;
	u32 in_block_length = 0;
	u32 out_block_length = 0;

	cs_tile_memcpy()
	{
		// Keep the hard-coded push constant size in sync with the payload struct
		static_assert(sizeof(params) == 44, "Push constant block size does not match the params struct");

		ssbo_count = 2;
		use_push_constants = true;
		push_constants_size = 44;

		create();

		m_src =
		#include "../Program/GLSLSnippets/RSXMemoryTiling.glsl"
		;

		// One texel per workgroup
		optimal_group_size = 1;

		// Placeholders in the shader source and their replacements for this backend
		const std::pair<std::string_view, std::string> syntax_replace[] =
		{
			{ "%loc", "0" },
			{ "%set", "set = 0" },
			{ "%push_block", "push_constant" },
			{ "%ws", std::to_string(optimal_group_size) },
			{ "%op", std::to_string(Op) }
		};

		m_src = fmt::replace_all(m_src, syntax_replace);
	}

	void bind_resources() override
	{
		// The shader expects tiled data at binding 0 and linear data at binding 1.
		// XOR with the op swaps the slots: decode binds src to 0, encode binds src to 1.
		const auto op = static_cast<int>(Op);
		m_program->bind_buffer({ src_buffer->value, in_offset, in_block_length }, 0 ^ op, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
		m_program->bind_buffer({ dst_buffer->value, out_offset, out_block_length }, 1 ^ op, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
	}

	// Uploads the current params as push constants
	void set_parameters(const vk::command_buffer& cmd)
	{
		vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, push_constants_size, &params);
	}

	// Records one tiling/detiling dispatch described by config into cmd.
	// Nothing is recorded if the image is empty or the tile pitch cannot be decomposed.
	void run(const vk::command_buffer& cmd, const RSX_detiler_config& config)
	{
		if (config.image_width == 0 || config.image_height == 0)
		{
			// Nothing to transfer. Also protects the bpp division below.
			return;
		}

		// Splits the pitch (in units of 256 bytes) into { prime, factor }.
		// Returns { 0, 0 } if the pitch is not supported.
		auto get_prime_factor = [](u32 pitch) -> std::pair<u32, u32>
		{
			const u32 base = (pitch >> 8);

			if (base != 0)
			{
				if ((pitch & (pitch - 1)) == 0)
				{
					return { 1u, base };
				}

				for (const u32 prime : { 3u, 5u, 7u, 11u, 13u })
				{
					if ((base % prime) == 0)
					{
						return { prime, base / prime };
					}
				}
			}

			rsx_log.error("Unexpected pitch value 0x%x", pitch);
			return {};
		};

		const auto [prime, factor] = get_prime_factor(config.tile_pitch);
		const u32 tiles_per_row = prime * factor;

		if (tiles_per_row == 0)
		{
			// The shader divides by num_tiles_per_row; dispatching with 0 is undefined.
			// The error has already been logged above.
			return;
		}

		dst_buffer = config.dst;
		src_buffer = config.src;

		this->in_offset = config.src_offset;
		this->out_offset = config.dst_offset;

		// Number of rows of tiled memory that take part in the transfer.
		// tile_pitch is known to be non-zero at this point.
		const auto tiled_height = std::min(
			utils::align<u32>(config.image_height, 64),
			utils::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch)
		);

		if constexpr (Op == RSX_detiler_op::decode)
		{
			this->in_block_length = tiled_height * config.tile_pitch;
			this->out_block_length = config.image_height * config.image_pitch;
		}
		else
		{
			this->in_block_length = config.image_height * config.image_pitch;
			this->out_block_length = tiled_height * config.tile_pitch;
		}

		params.prime = prime;
		params.factor = factor;
		params.num_tiles_per_row = tiles_per_row;
		params.tile_base_address = config.tile_base_address;
		params.tile_size = config.tile_size;
		params.tile_offset = config.tile_base_offset;
		params.tile_pitch = config.tile_pitch;
		params.tile_bank = config.bank;
		params.image_width = config.image_width;
		params.image_height = config.image_height;
		params.image_bpp = config.image_pitch / config.image_width;
		set_parameters(cmd);

		// One invocation per texel
		const u32 invocations_x = utils::aligned_div(config.image_width, optimal_group_size);
		compute_task::run(cmd, invocations_x, config.image_height, 1);
	}
};
// TODO: Replace with a proper manager
extern std::unordered_map<u32, std::unique_ptr<vk::compute_task>> g_compute_tasks;

View File

@ -92,16 +92,24 @@ namespace vk
rsx_pitch = pitch;
const bool require_format_conversion = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || src->format() == VK_FORMAT_D32_SFLOAT;
const auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(valid_range);
const bool require_tiling = !!tiled_region;
const bool require_gpu_transform = require_format_conversion || pack_unpack_swap_bytes || require_tiling;
auto dma_mapping = vk::map_dma(valid_range.start, valid_range.length());
if (require_format_conversion || pack_unpack_swap_bytes)
if (require_gpu_transform)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = real_pitch;
const auto task_length = transfer_pitch * src_area.height();
const auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
auto working_buffer_length = calculate_working_buffer_size(task_length, src->aspect());
if (require_tiling) {
working_buffer_length += tiled_region.tile->size;
}
auto working_buffer = vk::get_scratch_buffer(cmd, working_buffer_length);
u32 result_offset = 0;
VkBufferImageCopy region = {};
region.imageSubresource = { src->aspect(), 0, 0, 1 };
@ -142,17 +150,56 @@ namespace vk
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
if (!require_tiling)
{
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
require_rw_barrier = false;
require_rw_barrier = false;
}
}
}
if (require_tiling)
{
#if !DEBUG_DMA_TILING
// Compute -> Compute barrier
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT);
// Prepare payload
const RSX_detiler_config config =
{
.tile_base_address = tiled_region.base_address,
.tile_base_offset = valid_range.start - tiled_region.base_address,
.tile_size = tiled_region.tile->size,
.tile_pitch = tiled_region.tile->pitch,
.bank = tiled_region.tile->bank,
.dst = working_buffer,
.dst_offset = task_length,
.src = working_buffer,
.src_offset = 0,
.image_width = width,
.image_height = height,
.image_pitch = real_pitch
};
// Execute
const auto job = vk::get_compute_task<vk::cs_tile_memcpy<RSX_detiler_op::encode>>();
job->run(cmd, config);
result_offset = task_length;
require_rw_barrier = true;
#endif
}
if (require_rw_barrier)
{
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, working_buffer_length,
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, result_offset, working_buffer_length,
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
}
@ -160,6 +207,7 @@ namespace vk
if (rsx_pitch == real_pitch) [[likely]]
{
VkBufferCopy copy = {};
copy.srcOffset = result_offset;
copy.dstOffset = dma_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, dma_mapping.second->value, 1, &copy);
@ -178,7 +226,7 @@ namespace vk
copy.reserve(transfer_height);
u32 dst_offset = dma_mapping.first;
u32 src_offset = 0;
u32 src_offset = result_offset;
for (unsigned row = 0; row < transfer_height; ++row)
{

View File

@ -14,7 +14,7 @@
#include <memory>
#include <vector>
#define DEBUG_DMA_TILING 1
#define DEBUG_DMA_TILING 0
#if DEBUG_DMA_TILING
#include "../Common/tiled_dma_copy.hpp"