feat(mesa): Add patch for drastically improved RADV RT performance

2025-02-04 03:39:58 +00:00 · 2024-01-05 12:16:58 -08:00 · 2024-01-05 12:16:58 -08:00 · 566bb401ee
commit 566bb401ee
parent 70b0737770
2 changed files with 230 additions and 2 deletions
--- a/spec_files/mesa/26105.patch
+++ b/spec_files/mesa/26105.patch
@ -0,0 +1,225 @@
+From 1e3541728b63cd635aac7211b76798edfa8c34eb Mon Sep 17 00:00:00 2001
+From: Friedrich Vock <friedrich.vock@gmx.de>
+Date: Tue, 7 Nov 2023 22:28:44 +0100
+Subject: [PATCH] radv,aco: Convert 1D ray launches to 2D
+
+Because we use unaligned dispatches, 1D launches only use 8 threads per
+wave. Converting to 2D and fixing up launch IDs in the prolog
+significantly increases occupancy.
+
+Gives ~30% uplift in Ghostwire Tokyo.
+
+Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26105>
+---
+ .../compiler/aco_instruction_selection.cpp    | 43 ++++++++++-
+ src/amd/compiler/aco_interface.h              |  3 +
+ src/amd/vulkan/radv_cmd_buffer.c              | 76 +++++++++++++++----
+ 3 files changed, 103 insertions(+), 19 deletions(-)
+
+diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
+index 4bcf85f8ed644..bdf81dff3589d 100644
+--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
+@@ -12534,7 +12534,8 @@ select_rt_prolog(Program* program, ac_shader_config* config,
+     */
+    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
+    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
+-   PhysReg out_launch_size_z = out_launch_size_x.advance(8);
+   PhysReg out_launch_size_y = out_launch_size_x.advance(4);
+   PhysReg out_launch_size_z = out_launch_size_y.advance(4);
+    PhysReg out_launch_ids[3];
+    for (unsigned i = 0; i < 3; i++)
+       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
+@@ -12542,9 +12543,13 @@ select_rt_prolog(Program* program, ac_shader_config* config,
+    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
+ 
+    /* Temporaries: */
+-   num_sgprs = align(num_sgprs, 2) + 4;
+-   PhysReg tmp_raygen_sbt = PhysReg{num_sgprs - 4};
+-   PhysReg tmp_ring_offsets = PhysReg{num_sgprs - 2};
+   num_sgprs = align(num_sgprs, 2);
+   PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
+   num_sgprs += 2;
+   PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
+   num_sgprs += 2;
+
+   PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
+ 
+    /* Confirm some assumptions about register aliasing */
+    assert(in_ring_offsets == out_uniform_shader_addr);
+@@ -12618,6 +12623,36 @@ select_rt_prolog(Program* program, ac_shader_config* config,
+    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
+             Operand(tmp_raygen_sbt.advance(4), s1));
+ 
+   /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
+    * Calculating the 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size).
+    * in_wg_id_x now holds wg_id.x * wg_size.
+    */
+   bld.sop2(aco_opcode::s_lshl_b32, Definition(in_wg_id_x, s1), Definition(scc, s1),
+            Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
+
+   /* Calculate and add local_invocation_index */
+   bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
+            Operand(in_wg_id_x, s1));
+   if (program->wave_size == 64) {
+      if (program->gfx_level <= GFX7)
+         bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
+                  Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
+      else
+         bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
+                  Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
+   }
+
+   /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
+   bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
+            Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
+   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
+            Operand::c32_or_c64(-1u, program->wave_size == 64),
+            Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
+   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
+            Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
+   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
+            Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
+
+    /* jump to raygen */
+    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
+ 
+diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
+index 85c270ba19983..15e5398416b3c 100644
+--- a/src/amd/compiler/aco_interface.h
+++ b/src/amd/compiler/aco_interface.h
+@@ -34,6 +34,9 @@
+ extern "C" {
+ #endif
+ 
+/* Special launch size to indicate this dispatch is a 1D dispatch converted into a 2D one */
+#define ACO_RT_CONVERTED_2D_LAUNCH_SIZE -1u
+
+ struct ac_shader_config;
+ struct aco_shader_info;
+ struct aco_vs_prolog_info;
+diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
+index dab2230d778a5..81d553cf32fe7 100644
+--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
+@@ -42,6 +42,8 @@
+ #include "ac_debug.h"
+ #include "ac_shader_args.h"
+ 
+#include "aco_interface.h"
+
+ #include "util/fast_idiv_by_const.h"
+ 
+ enum {
+@@ -9963,7 +9965,26 @@ enum radv_rt_mode {
+ };
+ 
+ static void
+-radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
+radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables,
+                              enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va)
+{
+   uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
+                                                      : offsetof(VkTraceRaysIndirectCommand2KHR, width);
+
+   uint32_t offset;
+   if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
+      return;
+
+   uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
+
+   if (mode == radv_rt_mode_direct)
+      *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
+   if (sbt_va)
+      *sbt_va = upload_va;
+}
+
+static void
+radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
+                 enum radv_rt_mode mode)
+ {
+    if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_RT)
+@@ -9984,34 +10005,43 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom
+    cmd_buffer->compute_scratch_size_per_wave_needed =
+       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
+ 
+   /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase
+    * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */
+   if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0])
+      tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE;
+
+    struct radv_dispatch_info info = {0};
+    info.unaligned = true;
+ 
+-   uint64_t launch_size_va;
+-   uint64_t sbt_va;
+   uint64_t launch_size_va = 0;
+   uint64_t sbt_va = 0;
+ 
+    if (mode != radv_rt_mode_indirect2) {
+-      uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
+-                                                         : offsetof(VkTraceRaysIndirectCommand2KHR, width);
+-
+-      uint32_t offset;
+-      if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
+-         return;
+-
+-      uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
+-
+-      launch_size_va =
+-         (mode == radv_rt_mode_direct) ? upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width) : indirect_va;
+-      sbt_va = upload_va;
+      launch_size_va = indirect_va;
+      radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va);
+    } else {
+       launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
+       sbt_va = indirect_va;
+    }
+ 
+   uint32_t remaining_ray_count = 0;
+
+    if (mode == radv_rt_mode_direct) {
+       info.blocks[0] = tables->width;
+       info.blocks[1] = tables->height;
+       info.blocks[2] = tables->depth;
+
+      if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) {
+         /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and
+          * a multiple of the x block size because the invocation offset must be a multiple of the block size when
+          * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so
+          * we only need to ensure that the ray count is a multiple of the x block size. */
+         remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0];
+
+         uint32_t ray_count = tables->width - remaining_ray_count;
+         info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1];
+         info.blocks[1] = rt_prolog->info.cs.block_size[1];
+      }
+    } else
+       info.va = launch_size_va;
+ 
+@@ -10045,6 +10075,22 @@ radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCom
+    assert(cmd_buffer->cs->cdw <= cdw_max);
+ 
+    radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
+
+   if (remaining_ray_count) {
+      info.blocks[0] = remaining_ray_count;
+      info.blocks[1] = 1;
+      info.offsets[0] = tables->width - remaining_ray_count;
+
+      /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch */
+      tables->height = 1;
+      radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL);
+      if (size_loc->sgpr_idx != -1) {
+         radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + size_loc->sgpr_idx * 4, launch_size_va,
+                                  true);
+      }
+
+      radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
+   }
+ }
+ 
+ VKAPI_ATTR void VKAPI_CALL
+-- 
+GitLab
+
--- a/spec_files/mesa/mesa.spec
+++ b/spec_files/mesa/mesa.spec
@ -63,7 +63,7 @@ Name:           mesa
 Summary:        Mesa graphics libraries
 %global ver 23.3.1
 Version:        %{lua:ver = string.gsub(rpm.expand("%{ver}"), "-", "~"); print(ver)}
-Release:        100.bazzite.{{{ git_dir_version }}}
+Release:        101.bazzite.{{{ git_dir_version }}}
 License:        MIT AND BSD-3-Clause AND SGI-B-2.0
 URL:            http://www.mesa3d.org

@ -76,8 +76,11 @@ Source1:        Mesa-MLAA-License-Clarification-Email.txt
 Patch10:        gnome-shell-glthread-disable.patch
 Patch11:        0001-intel-compiler-move-gen5-final-pass-to-actually-be-f.patch

+# https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26105/
+Patch30:        26105.patch
+
 # https://gitlab.com/evlaV/mesa/
-Patch30:         valve.patch
+Patch40:        valve.patch

 BuildRequires:  meson >= 1.2.0
 BuildRequires:  gcc