diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 561f3cc02e0..ebc54a900e3 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -12526,7 +12526,8 @@ select_rt_prolog(Program* program, ac_shader_config* config, */ PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr); PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size); - PhysReg out_launch_size_z = out_launch_size_x.advance(8); + PhysReg out_launch_size_y = out_launch_size_x.advance(4); + PhysReg out_launch_size_z = out_launch_size_y.advance(4); PhysReg out_launch_ids[3]; for (unsigned i = 0; i < 3; i++) out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4); @@ -12534,9 +12535,13 @@ select_rt_prolog(Program* program, ac_shader_config* config, PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record); /* Temporaries: */ - num_sgprs = align(num_sgprs, 2) + 4; - PhysReg tmp_raygen_sbt = PhysReg{num_sgprs - 4}; - PhysReg tmp_ring_offsets = PhysReg{num_sgprs - 2}; + num_sgprs = align(num_sgprs, 2); + PhysReg tmp_raygen_sbt = PhysReg{num_sgprs}; + num_sgprs += 2; + PhysReg tmp_ring_offsets = PhysReg{num_sgprs}; + num_sgprs += 2; + + PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++}; /* Confirm some assumptions about register aliasing */ assert(in_ring_offsets == out_uniform_shader_addr); @@ -12610,6 +12615,36 @@ select_rt_prolog(Program* program, ac_shader_config* config, bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1), Operand(tmp_raygen_sbt.advance(4), s1)); + /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs. + * Calculating the 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size). + * in_wg_id_x now holds wg_id.x * wg_size. + */ + bld.sop2(aco_opcode::s_lshl_b32, Definition(in_wg_id_x, s1), Definition(scc, s1), + Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6)); + + /* Calculate and add local_invocation_index */ + bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u), + Operand(in_wg_id_x, s1)); + if (program->wave_size == 64) { + if (program->gfx_level <= GFX7) + bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1), + Operand::c32(-1u), Operand(tmp_invocation_idx, v1)); + else + bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1), + Operand::c32(-1u), Operand(tmp_invocation_idx, v1)); + } + + /* Make fixup operations a no-op if this is not a converted 2D dispatch. */ + bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1), + Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1)); + bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), + Operand::c32_or_c64(-1u, program->wave_size == 64), + Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1), + Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(), + Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm)); + /* jump to raygen */ bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2)); diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h index 8f35e18b5b0..9d2c1dbb2af 100644 --- a/src/amd/compiler/aco_interface.h +++ b/src/amd/compiler/aco_interface.h @@ -32,6 +32,9 @@ extern "C" { #endif +/* Special launch size to indicate this dispatch is a 1D dispatch converted into a 2D one */ +#define ACO_RT_CONVERTED_2D_LAUNCH_SIZE -1u + typedef struct nir_shader nir_shader; struct ac_shader_config; struct aco_shader_info; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index cdede679552..c7dd4b216d4 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -42,6 +42,8 @@ #include "ac_debug.h" #include "ac_shader_args.h" +#include "aco_interface.h" + #include "util/fast_idiv_by_const.h" enum { @@ -10003,7 +10005,26 @@ enum radv_rt_mode { }; static void -radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va, +radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, + enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va) +{ + uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR) + : offsetof(VkTraceRaysIndirectCommand2KHR, width); + + uint32_t offset; + if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset)) + return; + + uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; + + if (mode == radv_rt_mode_direct) + *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width); + if (sbt_va) + *sbt_va = upload_va; +} + +static void +radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va, enum radv_rt_mode mode) { if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT) @@ -10024,34 +10045,43 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom cmd_buffer->compute_scratch_size_per_wave_needed = MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave); + /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase + * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */ + if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0]) + tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE; + struct radv_dispatch_info info = {0}; info.unaligned = true; - uint64_t launch_size_va; - uint64_t sbt_va; + uint64_t launch_size_va = 0; + uint64_t sbt_va = 0; if (mode != radv_rt_mode_indirect2) { - uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR) - : offsetof(VkTraceRaysIndirectCommand2KHR, width); - - uint32_t offset; - if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset)) - return; - - uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset; - - launch_size_va = - (mode == radv_rt_mode_direct) ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width) : indirect_va; - sbt_va = upload_va; + launch_size_va = indirect_va; + radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va); } else { launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width); sbt_va = indirect_va; } + uint32_t remaining_ray_count = 0; + if (mode == radv_rt_mode_direct) { info.blocks[0] = tables->width; info.blocks[1] = tables->height; info.blocks[2] = tables->depth; + + if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) { + /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and + * a multiple of the x block size because the invocation offset must be a multiple of the block size when + * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so + * we only need to ensure that the ray count is a multiple of the x block size. */ + remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0]; + + uint32_t ray_count = tables->width - remaining_ray_count; + info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1]; + info.blocks[1] = rt_prolog->info.cs.block_size[1]; + } } else info.va = launch_size_va; @@ -10085,6 +10115,22 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom assert(cmd_buffer->cs->cdw <= cdw_max); radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); + + if (remaining_ray_count) { + info.blocks[0] = remaining_ray_count; + info.blocks[1] = 1; + info.offsets[0] = tables->width - remaining_ray_count; + + /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch */ + tables->height = 1; + radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL); + if (size_loc->sgpr_idx != -1) { + radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va, + true); + } + + radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); + } } VKAPI_ATTR void VKAPI_CALL