From 86da28257026e8f23b1fdcbda5630b8980105313 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Fri, 22 Mar 2019 20:39:11 +1000
Subject: [PATCH 1/3] OGL: Support subgroup reduction operations via GL_NV_shader_thread_shuffle

---
 .../VideoBackends/OGL/ProgramShaderCache.cpp | 30 +++++++++++++++++--
 Source/Core/VideoBackends/OGL/Render.cpp     |  5 ++++
 Source/Core/VideoBackends/OGL/Render.h       |  1 +
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
index e06491b986..2dc1ef318c 100644
--- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
+++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp
@@ -719,6 +719,30 @@ void ProgramShaderCache::CreateHeader()
     break;
   }
 
+  // NV-only subgroup helpers; stays empty (SUPPORTS_SUBGROUP_REDUCTION undefined) otherwise.
+  std::string shader_shuffle_string;
+  if (g_ogl_config.bSupportsShaderThreadShuffleNV)
+  {
+    shader_shuffle_string = R"(
+#extension GL_NV_shader_thread_group : enable
+#extension GL_NV_shader_thread_shuffle : enable
+#define SUPPORTS_SUBGROUP_REDUCTION 1
+
+// The xor shuffle below produces incorrect results if all threads in a warp are not active.
+#define CAN_USE_SUBGROUP_REDUCTION (ballotThreadNV(true) == 0xFFFFFFFFu)
+
+#define IS_HELPER_INVOCATION gl_HelperThreadNV
+#define IS_FIRST_ACTIVE_INVOCATION (gl_ThreadInWarpNV == findLSB(ballotThreadNV(!gl_HelperThreadNV)))
+#define SUBGROUP_REDUCTION(func, value) value = func(value, shuffleXorNV(value, 16, 32)); \
+                                        value = func(value, shuffleXorNV(value, 8, 32)); \
+                                        value = func(value, shuffleXorNV(value, 4, 32)); \
+                                        value = func(value, shuffleXorNV(value, 2, 32)); \
+                                        value = func(value, shuffleXorNV(value, 1, 32));
+#define SUBGROUP_MIN(value) SUBGROUP_REDUCTION(min, value)
+#define SUBGROUP_MAX(value) SUBGROUP_REDUCTION(max, value)
+)";
+  }
+
   s_glsl_header = StringFromFormat(
       "%s\n"
       "%s\n"  // ubo
@@ -737,6 +761,7 @@ void ProgramShaderCache::CreateHeader()
       "%s\n"  // ES dual source blend
       "%s\n"  // shader image load store
       "%s\n"  // shader framebuffer fetch
+      "%s\n"  // shader thread shuffle
 
       // Precision defines for GLSL ES
       "%s\n"
@@ -815,8 +840,9 @@ void ProgramShaderCache::CreateHeader()
       ((!is_glsles && v < Glsl430) || (is_glsles && v < GlslEs310)) ?
           "#extension GL_ARB_shader_image_load_store : enable" :
           "",
-      framebuffer_fetch_string.c_str(), is_glsles ? "precision highp float;" : "",
-      is_glsles ? "precision highp int;" : "", is_glsles ? "precision highp sampler2DArray;" : "",
+      framebuffer_fetch_string.c_str(), shader_shuffle_string.c_str(),
+      is_glsles ? "precision highp float;" : "", is_glsles ? "precision highp int;" : "",
+      is_glsles ? "precision highp sampler2DArray;" : "",
       (is_glsles && g_ActiveConfig.backend_info.bSupportsPaletteConversion) ?
           "precision highp usamplerBuffer;" :
           "",
diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp
index fca1de7e7b..06e5135191 100644
--- a/Source/Core/VideoBackends/OGL/Render.cpp
+++ b/Source/Core/VideoBackends/OGL/Render.cpp
@@ -661,6 +661,11 @@ Renderer::Renderer(std::unique_ptr<GLContext> main_gl_context, float backbuffer_
   if (g_ogl_config.max_samples < 1 || !g_ogl_config.bSupportsMSAA)
     g_ogl_config.max_samples = 1;
 
+  // The subgroup shader header uses builtins from both NV extensions, so require both of them.
+  g_ogl_config.bSupportsShaderThreadShuffleNV =
+      GLExtensions::Supports("GL_NV_shader_thread_group") &&
+      GLExtensions::Supports("GL_NV_shader_thread_shuffle");
+
   // We require texel buffers, image load store, and compute shaders to enable GPU texture decoding.
   // If the driver doesn't expose the extensions, but supports GL4.3/GLES3.1, it will still be
   // enabled in the version check below.
diff --git a/Source/Core/VideoBackends/OGL/Render.h b/Source/Core/VideoBackends/OGL/Render.h
index 442a31d5c0..0fb6c4e93e 100644
--- a/Source/Core/VideoBackends/OGL/Render.h
+++ b/Source/Core/VideoBackends/OGL/Render.h
@@ -70,6 +70,7 @@ struct VideoConfig
   bool bSupportsBitfield;
   bool bSupportsTextureSubImage;
   EsFbFetchType SupportedFramebufferFetch;
+  bool bSupportsShaderThreadShuffleNV;
 
   const char* gl_vendor;
   const char* gl_renderer;

From 6561850f2bbb8e6faa98e23bdf8293c002fbd405 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Fri, 22 Mar 2019 20:39:37 +1000
Subject: [PATCH 2/3] Vulkan: Support subgroup reduction operations via GL_KHR_shader_subgroup

---
 .../VideoBackends/Vulkan/ShaderCompiler.cpp   | 22 ++++++++++++++-
 .../VideoBackends/Vulkan/VulkanContext.cpp    | 28 +++++++++++++++++++
 .../Core/VideoBackends/Vulkan/VulkanContext.h |  6 ++++
 .../Vulkan/VulkanEntryPoints.inl              |  1 +
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
index abe6df2653..a9c1fa2c19 100644
--- a/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
+++ b/Source/Core/VideoBackends/Vulkan/ShaderCompiler.cpp
@@ -17,6 +17,7 @@
 #include "ShaderLang.h"
 #include "disassemble.h"
 
+#include "Common/CommonFuncs.h"
 #include "Common/FileUtil.h"
 #include "Common/Logging/Log.h"
 #include "Common/MsgHandler.h"
@@ -99,6 +100,18 @@ static const char COMPUTE_SHADER_HEADER[] = R"(
   #define frac fract
   #define lerp mix
 )";
+static const char SUBGROUP_HELPER_HEADER[] = R"(
+  #extension GL_KHR_shader_subgroup_basic : enable
+  #extension GL_KHR_shader_subgroup_arithmetic : enable
+  #extension GL_KHR_shader_subgroup_ballot : enable
+
+  #define SUPPORTS_SUBGROUP_REDUCTION 1
+  #define CAN_USE_SUBGROUP_REDUCTION true
+  #define IS_HELPER_INVOCATION gl_HelperInvocation
+  #define IS_FIRST_ACTIVE_INVOCATION (gl_SubgroupInvocationID == subgroupBallotFindLSB(subgroupBallot(!gl_HelperInvocation)))
+  #define SUBGROUP_MIN(value) value = subgroupMin(value)
+  #define SUBGROUP_MAX(value) value = subgroupMax(value)
+)";
 
 bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char* stage_filename,
                         const char* source_code, size_t source_code_length, const char* header,
@@ -120,13 +133,20 @@ bool CompileShaderToSPV(SPIRVCodeVector* out_code, EShLanguage stage, const char
   int pass_source_code_length = static_cast<int>(source_code_length);
   if (header_length > 0)
   {
-    full_source_code.reserve(header_length + source_code_length);
+    constexpr size_t subgroup_helper_header_length = ArraySize(SUBGROUP_HELPER_HEADER) - 1;
+    full_source_code.reserve(header_length + subgroup_helper_header_length + source_code_length);
     full_source_code.append(header, header_length);
+    if (g_vulkan_context->SupportsShaderSubgroupOperations())
+      full_source_code.append(SUBGROUP_HELPER_HEADER, subgroup_helper_header_length);
     full_source_code.append(source_code, source_code_length);
     pass_source_code = full_source_code.c_str();
     pass_source_code_length = static_cast<int>(full_source_code.length());
   }
 
+  // Sub-group operations require Vulkan 1.1 and SPIR-V 1.3.
+  if (g_vulkan_context->SupportsShaderSubgroupOperations())
+    shader->setEnvTarget(glslang::EShTargetSpv, glslang::EShTargetSpv_1_3);
+
   shader->setStringsWithLengths(&pass_source_code, &pass_source_code_length, 1);
 
   auto DumpBadShader = [&](const char* msg) {
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
index 51c0cde329..52038e39dc 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@@ -384,6 +384,7 @@ std::unique_ptr<VulkanContext> VulkanContext::Create(VkInstance instance, VkPhys
 
   // Initialize DriverDetails so that we can check for bugs to disable features if needed.
   context->InitDriverDetails();
+  context->PopulateShaderSubgroupSupport();
 
   // Enable debug reports if the "Host GPU" log category is enabled.
   if (enable_debug_reports)
@@ -864,4 +865,31 @@ void VulkanContext::InitDriverDetails()
                       static_cast<double>(m_device_properties.driverVersion),
                       DriverDetails::Family::UNKNOWN);
 }
+
+void VulkanContext::PopulateShaderSubgroupSupport()
+{
+  // The entry point alone only proves loader support; the device must report Vulkan 1.1 too.
+  if (!vkGetPhysicalDeviceProperties2 || m_device_properties.apiVersion < VK_API_VERSION_1_1)
+    return;
+
+  VkPhysicalDeviceProperties2 device_properties_2 = {};
+  device_properties_2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+
+  VkPhysicalDeviceSubgroupProperties subgroup_properties = {};
+  subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
+  device_properties_2.pNext = &subgroup_properties;
+
+  vkGetPhysicalDeviceProperties2(m_physical_device, &device_properties_2);
+
+  m_shader_subgroup_size = subgroup_properties.subgroupSize;
+
+  // We require basic ops (for gl_SubgroupInvocationID), ballot (for subgroupBallot,
+  // subgroupBallotFindLSB), and arithmetic (for subgroupMin/subgroupMax).
+  constexpr VkSubgroupFeatureFlags required_operations = VK_SUBGROUP_FEATURE_BASIC_BIT |
+                                                         VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
+                                                         VK_SUBGROUP_FEATURE_BALLOT_BIT;
+  m_supports_shader_subgroup_operations =
+      (subgroup_properties.supportedOperations & required_operations) == required_operations &&
+      (subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT) != 0;
+}
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.h b/Source/Core/VideoBackends/Vulkan/VulkanContext.h
index 3f4492bc4a..dcd9584e50 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.h
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.h
@@ -80,6 +80,8 @@ public:
   {
     return m_device_features.occlusionQueryPrecise == VK_TRUE;
   }
+  u32 GetShaderSubgroupSize() const { return m_shader_subgroup_size; }
+  bool SupportsShaderSubgroupOperations() const { return m_supports_shader_subgroup_operations; }
 
   // Helpers for getting constants
   VkDeviceSize GetUniformBufferAlignment() const
@@ -112,6 +114,7 @@ private:
   bool SelectDeviceFeatures();
   bool CreateDevice(VkSurfaceKHR surface, bool enable_validation_layer);
   void InitDriverDetails();
+  void PopulateShaderSubgroupSupport();
 
   VkInstance m_instance = VK_NULL_HANDLE;
   VkPhysicalDevice m_physical_device = VK_NULL_HANDLE;
@@ -128,6 +131,9 @@ private:
   VkPhysicalDeviceFeatures m_device_features = {};
   VkPhysicalDeviceProperties m_device_properties = {};
   VkPhysicalDeviceMemoryProperties m_device_memory_properties = {};
+
+  u32 m_shader_subgroup_size = 1;
+  bool m_supports_shader_subgroup_operations = false;
 };
 
 extern std::unique_ptr<VulkanContext> g_vulkan_context;
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl b/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
index c40483e10e..2a4c4bda24 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
+++ b/Source/Core/VideoBackends/Vulkan/VulkanEntryPoints.inl
@@ -59,6 +59,7 @@ VULKAN_INSTANCE_ENTRY_POINT(vkCreateMacOSSurfaceMVK, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkCreateDebugReportCallbackEXT, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkDestroyDebugReportCallbackEXT, false)
 VULKAN_INSTANCE_ENTRY_POINT(vkDebugReportMessageEXT, false)
+VULKAN_INSTANCE_ENTRY_POINT(vkGetPhysicalDeviceProperties2, false)
 
 #endif  // VULKAN_INSTANCE_ENTRY_POINT
 

From d66d778bae8484cdff239da33c1b87ba41297366 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Fri, 22 Mar 2019 20:39:54 +1000
Subject: [PATCH 3/3] PixelShaderGen: Use subgroup reduction operations for bounding box

---
 Source/Core/VideoCommon/PixelShaderGen.cpp | 42 ++++++++++++++++------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp
index 4d37418725..7a771c430e 100644
--- a/Source/Core/VideoCommon/PixelShaderGen.cpp
+++ b/Source/Core/VideoCommon/PixelShaderGen.cpp
@@ -459,6 +459,19 @@ SSBO_BINDING(0) buffer BBox {
 };
 #endif
 
+// Grows the stored bounding box so that it covers min_pos/max_pos. Each bound is compared
+// with a plain read first, so the atomic is only issued when it would change the stored value.
+void UpdateBoundingBoxBuffer(int2 min_pos, int2 max_pos) {
+  if (bbox_left > min_pos.x)
+    atomicMin(bbox_left, min_pos.x);
+  if (bbox_right < max_pos.x)
+    atomicMax(bbox_right, max_pos.x);
+  if (bbox_top > min_pos.y)
+    atomicMin(bbox_top, min_pos.y);
+  if (bbox_bottom < max_pos.y)
+    atomicMax(bbox_bottom, max_pos.y);
+}
+
 void UpdateBoundingBox(float2 rawpos) {
   // The pixel center in the GameCube GPU is 7/12, not 0.5 (see VertexShaderGen.cpp)
   // Adjust for this by unapplying the offset we added in the vertex shader.
@@ -471,18 +484,25 @@ void UpdateBoundingBox(float2 rawpos) {
 #endif
 
   // The bounding box register is exclusive of the right coordinate, hence the +1.
-  int2 pos = iround(rawpos * cefbscale + offset);
-  int2 pos_offset = pos + int2(1, 1);
+  int2 min_pos = iround(rawpos * cefbscale + offset);
+  int2 max_pos = min_pos + int2(1, 1);
 
-  if (bbox_left > pos.x)
-    atomicMin(bbox_left, pos.x);
-  if (bbox_right < pos_offset.x)
-    atomicMax(bbox_right, pos_offset.x);
-  if (bbox_top > pos.y)
-    atomicMin(bbox_top, pos.y);
-  if (bbox_bottom < pos_offset.y)
-    atomicMax(bbox_bottom, pos_offset.y);
+#ifdef SUPPORTS_SUBGROUP_REDUCTION
+  if (CAN_USE_SUBGROUP_REDUCTION) {
+    min_pos = IS_HELPER_INVOCATION ? int2(2147483647, 2147483647) : min_pos;
+    max_pos = IS_HELPER_INVOCATION ? int2(-2147483648, -2147483648) : max_pos;
+    SUBGROUP_MIN(min_pos);
+    SUBGROUP_MAX(max_pos);
+    if (IS_FIRST_ACTIVE_INVOCATION)
+      UpdateBoundingBoxBuffer(min_pos, max_pos);
+  } else {
+    UpdateBoundingBoxBuffer(min_pos, max_pos);
+  }
+#else
+  UpdateBoundingBoxBuffer(min_pos, max_pos);
+#endif
 }
+
 )");
   }
 }
@@ -1332,7 +1352,7 @@ static void WriteAlphaTest(ShaderCode& out, const pixel_shader_uid_data* uid_dat
   if (!uid_data->alpha_test_use_zcomploc_hack)
   {
     out.Write("\t\tdiscard;\n");
-    if (ApiType != APIType::D3D)
+    if (ApiType == APIType::D3D)
       out.Write("\t\treturn;\n");
   }
 