From 86da28257026e8f23b1fdcbda5630b8980105313 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 22 Mar 2019 20:39:11 +1000
Subject: [PATCH 1/3] OGL: Support subgroup reduction operations via
 GL_NV_shader_thread_shuffle

---
 .../VideoBackends/OGL/ProgramShaderCache.cpp  | 29 +++++++++++++++++--
 Source/Core/VideoBackends/OGL/Render.cpp      |  3 ++
 Source/Core/VideoBackends/OGL/Render.h        |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
index e06491b986..2dc1ef318c 100644
--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
@@ -719,6 +719,29 @@ void ProgramShaderCache::CreateHeader()
     break;
   }
 
+  std::string shader_shuffle_string;
+  if (g_ogl_config.bSupportsShaderThreadShuffleNV)
+  {
+    shader_shuffle_string = R"(
+#extension GL_NV_shader_thread_group : enable
+#extension GL_NV_shader_thread_shuffle : enable
+#define SUPPORTS_SUBGROUP_REDUCTION 1
+
+// The xor shuffle below produces incorrect results if all threads in a warp are not active.
+#define CAN_USE_SUBGROUP_REDUCTION (ballotThreadNV(true) == 0xFFFFFFFFu)
+
+#define IS_HELPER_INVOCATION gl_HelperThreadNV
+#define IS_FIRST_ACTIVE_INVOCATION (gl_ThreadInWarpNV == findLSB(ballotThreadNV(!gl_HelperThreadNV)))
+#define SUBGROUP_REDUCTION(func, value) value = func(value, shuffleXorNV(value, 16, 32)); \
+                                        value = func(value, shuffleXorNV(value, 8, 32)); \
+                                        value = func(value, shuffleXorNV(value, 4, 32)); \
+                                        value = func(value, shuffleXorNV(value, 2, 32)); \
+                                        value = func(value, shuffleXorNV(value, 1, 32));
+#define SUBGROUP_MIN(value) SUBGROUP_REDUCTION(min, value)
+#define SUBGROUP_MAX(value) SUBGROUP_REDUCTION(max, value)
+)";
+  }
+
   s_glsl_header = StringFromFormat(
       "%s\n"
       "%s\n"  // ubo
@@ -737,6 +760,7 @@ void ProgramShaderCache::CreateHeader()
       "%s\n"  // ES dual source blend
       "%s\n"  // shader image load store
       "%s\n"  // shader framebuffer fetch
+      "%s\n"  // shader thread shuffle
 
       // Precision defines for GLSL ES
       "%s\n"
@@ -815,8 +839,9 @@ void ProgramShaderCache::CreateHeader()
               ((!is_glsles && v < Glsl430) || (is_glsles && v < GlslEs310)) ?
           "#extension GL_ARB_shader_image_load_store : enable" :
           "",
-      framebuffer_fetch_string.c_str(), is_glsles ? "precision highp float;" : "",
-      is_glsles ? "precision highp int;" : "", is_glsles ? "precision highp sampler2DArray;" : "",
+      framebuffer_fetch_string.c_str(), shader_shuffle_string.c_str(),
+      is_glsles ? "precision highp float;" : "", is_glsles ? "precision highp int;" : "",
+      is_glsles ? "precision highp sampler2DArray;" : "",
       (is_glsles && g_ActiveConfig.backend_info.bSupportsPaletteConversion) ?
           "precision highp usamplerBuffer;" :
           "",
diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp
index fca1de7e7b..06e5135191 100644
--- a/Source/Core/VideoBackends/OGL/Render.cpp
+++ b/Source/Core/VideoBackends/OGL/Render.cpp
@@ -661,6 +661,9 @@ Renderer::Renderer(std::unique_ptr<GLContext> main_gl_context, float backbuffer_
   if (g_ogl_config.max_samples < 1 || !g_ogl_config.bSupportsMSAA)
     g_ogl_config.max_samples = 1;
 
+  g_ogl_config.bSupportsShaderThreadShuffleNV =
+      GLExtensions::Supports("GL_NV_shader_thread_shuffle");
+
   // We require texel buffers, image load store, and compute shaders to enable GPU texture decoding.
   // If the driver doesn't expose the extensions, but supports GL4.3/GLES3.1, it will still be
   // enabled in the version check below.
diff --git a/Source/Core/VideoBackends/OGL/Render.h b/Source/Core/VideoBackends/OGL/Render.h
index 442a31d5c0..0fb6c4e93e 100644
--- a/Source/Core/VideoBackends/OGL/Render.h
+++ b/Source/Core/VideoBackends/OGL/Render.h
@@ -70,6 +70,7 @@ struct VideoConfig
   bool bSupportsBitfield;
   bool bSupportsTextureSubImage;
   EsFbFetchType SupportedFramebufferFetch;
+  bool bSupportsShaderThreadShuffleNV;
 
   const char* gl_vendor;
   const char* gl_renderer;

From 6561850f2bbb8e6faa98e23bdf8293c002fbd405 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 22 Mar 2019 20:39:37 +1000
Subject: [PATCH 2/3] Vulkan: Support subgroup reduction operations via
 GL_KHR_shader_subgroup

---
 .../VideoBackends/Vulkan/ShaderCompiler.cpp   | 22 ++++++++++++++-
 .../VideoBackends/Vulkan/VulkanContext.cpp    | 28 +++++++++++++++++++
 .../Core/VideoBackends/Vulkan/VulkanContext.h |  6 ++++
 .../Vulkan/VulkanEntryPoints.inl              |  1 +
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
index abe6df2653..a9c1fa2c19 100644
--- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
+++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
@@ -17,6 +17,7 @@
 #include "ShaderLang.h"
 #include "disassemble.h"
 
+#include "Common/CommonFuncs.h"
 #include "Common/FileUtil.h"
 #include "Common/Logging/Log.h"
 #include "Common/MsgHandler.h"
@@ -99,6 +100,18 @@ static const char COMPUTE_SHADER_HEADER[] = R"(
   #define frac fract
   #define lerp mix
 )";
+static const char SUBGROUP_HELPER_HEADER[] = R"(
+  #extension GL_KHR_shader_subgroup_basic : enable
+  #extension GL_KHR_shader_subgroup_arithmetic : enable
+  #extension GL_KHR_shader_subgroup_ballot : enable
+
+  #define SUPPORTS_SUBGROUP_REDUCTION 1
+  #define CAN_USE_SUBGROUP_REDUCTION true
+  #define IS_HELPER_INVOCATION gl_HelperInvocation
+  #define IS_FIRST_ACTIVE_INVOCATION (gl_SubgroupInvocationID == subgroupBallotFindLSB(subgroupBallot(!gl_HelperInvocation)))
+  #define SUBGROUP_MIN(value) value = subgroupMin(value)
+  #define SUBGROUP_MAX(value) value = subgroupMax(value)
+)";  // Helper lanes are masked out of the election: their writes have no effect.
 
 bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char* stage_filename,
                         const char* source_code, size_t source_code_length, const char* header,
@@ -120,13 +133,20 @@ bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char
   int pass_source_code_length = static_cast<int>(source_code_length);
   if (header_length > 0)
   {
-    full_source_code.reserve(header_length + source_code_length);
+    constexpr size_t subgroup_helper_header_length = ArraySize(SUBGROUP_HELPER_HEADER) - 1;
+    full_source_code.reserve(header_length + subgroup_helper_header_length + source_code_length);
     full_source_code.append(header, header_length);
+    if (g_vulkan_context->SupportsShaderSubgroupOperations())
+      full_source_code.append(SUBGROUP_HELPER_HEADER, subgroup_helper_header_length);
     full_source_code.append(source_code, source_code_length);
     pass_source_code = full_source_code.c_str();
     pass_source_code_length = static_cast<int>(full_source_code.length());
   }
 
+  // Sub-group operations require Vulkan 1.1 and SPIR-V 1.3.
+  if (g_vulkan_context->SupportsShaderSubgroupOperations())
+    shader->setEnvTarget(glslang::EShTargetSpv, glslang::EShTargetSpv_1_3);
+
   shader->setStringsWithLengths(&pass_source_code, &pass_source_code_length, 1);
 
   auto DumpBadShader = [&](const char* msg) {
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
index 51c0cde329..52038e39dc 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@@ -384,6 +384,7 @@ std::unique_ptr<VulkanContext> VulkanContext::Create(VkInstance instance, VkPhys
 
   // Initialize DriverDetails so that we can check for bugs to disable features if needed.
   context->InitDriverDetails();
+  context->PopulateShaderSubgroupSupport();
 
   // Enable debug reports if the "Host GPU" log category is enabled.
   if (enable_debug_reports)
@@ -864,4 +865,31 @@ void VulkanContext::InitDriverDetails()
                       static_cast<double>(m_device_properties.driverVersion),
                       DriverDetails::Family::UNKNOWN);
 }
+
+// Caches the device's subgroup size and whether the subgroup operations listed below are
+// usable from fragment shaders. Both members are left untouched when Vulkan 1.1 is unavailable.
+void VulkanContext::PopulateShaderSubgroupSupport()
+{
+  // A non-null entry point is not enough: the loader may export it for a Vulkan 1.0 device,
+  // so the API version reported by the device itself has to be at least 1.1 as well.
+  if (!vkGetPhysicalDeviceProperties2 || m_device_properties.apiVersion < VK_API_VERSION_1_1)
+    return;
+
+  VkPhysicalDeviceSubgroupProperties subgroup_properties = {};
+  subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+  VkPhysicalDeviceProperties2 device_properties_2 = {};
+  device_properties_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+  device_properties_2.pNext = &subgroup_properties;  // chained so both structs get filled
+  vkGetPhysicalDeviceProperties2(m_physical_device, &device_properties_2);
+  m_shader_subgroup_size = subgroup_properties.subgroupSize;
+
+  // We require basic ops (for gl_SubgroupInvocationID), ballot (for subgroupBallot,
+  // subgroupBallotFindLSB), and arithmetic (for subgroupMin/subgroupMax).
+  constexpr VkSubgroupFeatureFlags required_operations = VK_SUBGROUP_FEATURE_BASIC_BIT |
+                                                         VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+                                                         VK_SUBGROUP_FEATURE_BALLOT_BIT;
+  m_supports_shader_subgroup_operations =
+      (subgroup_properties.supportedOperations & required_operations) == required_operations &&
+      (subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT) != 0;
+}
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.h b/Source/Core/VideoBackends/Vulkan/VulkanContext.h
index 3f4492bc4a..dcd9584e50 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.h
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.h
@@ -80,6 +80,8 @@ public:
   {
     return m_device_features.occlusionQueryPrecise == VK_TRUE;
   }
+  u32 GetShaderSubgroupSize() const { return m_shader_subgroup_size; }
+  bool SupportsShaderSubgroupOperations() const { return m_supports_shader_subgroup_operations; }
 
   // Helpers for getting constants
   VkDeviceSize GetUniformBufferAlignment() const
@@ -112,6 +114,7 @@ private:
   bool SelectDeviceFeatures();
   bool CreateDevice(VkSurfaceKHR surface, bool enable_validation_layer);
   void InitDriverDetails();
+  void PopulateShaderSubgroupSupport();
 
   VkInstance m_instance = VK_NULL_HANDLE;
   VkPhysicalDevice m_physical_device = VK_NULL_HANDLE;
@@ -128,6 +131,9 @@ private:
   VkPhysicalDeviceFeatures m_device_features = {};
   VkPhysicalDeviceProperties m_device_properties = {};
   VkPhysicalDeviceMemoryProperties m_device_memory_properties = {};
+
+  u32 m_shader_subgroup_size = 1;
+  bool m_supports_shader_subgroup_operations = false;
 };
 
 extern std::unique_ptr<VulkanContext> g_vulkan_context;
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl b/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
index c40483e10e..2a4c4bda24 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
+++ b/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
@@ -59,6 +59,7 @@ VULKAN_INSTANCE_ENTRY_POINT(vkCreateMacOSSurfaceMVK, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkCreateDebugReportCallbackEXT, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkDestroyDebugReportCallbackEXT, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkDebugReportMessageEXT, false)
+VULKAN_INSTANCE_ENTRY_POINT(vkGetPhysicalDeviceProperties2, false)
 
 #endif  // VULKAN_INSTANCE_ENTRY_POINT
 

From d66d778bae8484cdff239da33c1b87ba41297366 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 22 Mar 2019 20:39:54 +1000
Subject: [PATCH 3/3] PixelShaderGen: Use subgroup reduction operations for
 bounding box

---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 40 ++++++++++++++++------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 4d37418725..7a771c430e 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -459,6 +459,17 @@ SSBO_BINDING(0) buffer BBox {
 };
 #endif
 
+void UpdateBoundingBoxBuffer(int2 min_pos, int2 max_pos) {  // Grows the stored box to fit both.
+  if (bbox_left > min_pos.x)  // Atomic is skipped when the edge already covers the point.
+    atomicMin(bbox_left, min_pos.x);
+  if (bbox_right < max_pos.x)
+    atomicMax(bbox_right, max_pos.x);
+  if (bbox_top > min_pos.y)
+    atomicMin(bbox_top, min_pos.y);
+  if (bbox_bottom < max_pos.y)
+    atomicMax(bbox_bottom, max_pos.y);
+}
+
 void UpdateBoundingBox(float2 rawpos) {
   // The pixel center in the GameCube GPU is 7/12, not 0.5 (see VertexShaderGen.cpp)
   // Adjust for this by unapplying the offset we added in the vertex shader.
@@ -471,18 +482,25 @@ void UpdateBoundingBox(float2 rawpos) {
 #endif
 
   // The bounding box register is exclusive of the right coordinate, hence the +1.
-  int2 pos = iround(rawpos * cefbscale + offset);
-  int2 pos_offset = pos + int2(1, 1);
+  int2 min_pos = iround(rawpos * cefbscale + offset);
+  int2 max_pos = min_pos + int2(1, 1);
 
-  if (bbox_left > pos.x)
-    atomicMin(bbox_left, pos.x);
-  if (bbox_right < pos_offset.x)
-    atomicMax(bbox_right, pos_offset.x);
-  if (bbox_top > pos.y)
-    atomicMin(bbox_top, pos.y);
-  if (bbox_bottom < pos_offset.y)
-    atomicMax(bbox_bottom, pos_offset.y);
+#ifdef SUPPORTS_SUBGROUP_REDUCTION
+  if (CAN_USE_SUBGROUP_REDUCTION) {
+    min_pos = IS_HELPER_INVOCATION ? int2(2147483647, 2147483647) : min_pos;
+    max_pos = IS_HELPER_INVOCATION ? int2(-2147483648, -2147483648) : max_pos;
+    SUBGROUP_MIN(min_pos);
+    SUBGROUP_MAX(max_pos);
+    if (IS_FIRST_ACTIVE_INVOCATION)
+      UpdateBoundingBoxBuffer(min_pos, max_pos);
+  } else {
+    UpdateBoundingBoxBuffer(min_pos, max_pos);
+  }
+#else
+  UpdateBoundingBoxBuffer(min_pos, max_pos);
+#endif
 }
+
 )");
   }
 }
@@ -1332,7 +1350,7 @@ static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_dat
   if (!uid_data->alpha_test_use_zcomploc_hack)
   {
     out.Write("\t\tdiscard;\n");
-    if (ApiType != APIType::D3D)
+    if (ApiType == APIType::D3D)
       out.Write("\t\treturn;\n");
   }