Use re-spirv instead of native specialization constants. Reduce the number of ubershader pipelines that need to be created. (#69)

This commit is contained in:
Darío 2024-08-07 21:38:50 -03:00 committed by GitHub
parent cb631739b0
commit b91d6a7441
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 137 additions and 103 deletions

3
.gitmodules vendored
View File

@ -52,3 +52,6 @@
[submodule "src/contrib/zstd"] [submodule "src/contrib/zstd"]
path = src/contrib/zstd path = src/contrib/zstd
url = https://github.com/facebook/zstd url = https://github.com/facebook/zstd
[submodule "src/contrib/re-spirv"]
path = src/contrib/re-spirv
url = https://github.com/rt64/re-spirv

View File

@ -3,11 +3,17 @@ project(rt64)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_VISIBILITY_PRESET hidden) set(CMAKE_CXX_VISIBILITY_PRESET hidden)
option(RT64_STATIC "Build RT64 as a static library" OFF)
option(RT64_BUILD_EXAMPLES "Build examples for RT64" OFF) option(RT64_BUILD_EXAMPLES "Build examples for RT64" OFF)
if (${RT64_BUILD_EXAMPLES}) if (${RT64_BUILD_EXAMPLES})
set(RT64_STATIC ON) set(RT64_STATIC ON)
endif() endif()
if (NOT ${RT64_STATIC})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
function(preprocess INFILE OUTFILE OPTIONS) function(preprocess INFILE OUTFILE OPTIONS)
if (CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") if (CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
@ -56,6 +62,7 @@ set(ZSTD_BUILD_STATIC ON)
set(ZSTD_BUILD_SHARED OFF) set(ZSTD_BUILD_SHARED OFF)
add_subdirectory(src/tools/file_to_c) add_subdirectory(src/tools/file_to_c)
add_subdirectory(src/contrib/re-spirv)
add_subdirectory(src/contrib/nativefiledialog-extended) add_subdirectory(src/contrib/nativefiledialog-extended)
add_subdirectory(src/contrib/zstd/build/cmake) add_subdirectory(src/contrib/zstd/build/cmake)
@ -353,7 +360,6 @@ if (WIN32)
include_directories("${PROJECT_SOURCE_DIR}/src/contrib/dxc/inc") include_directories("${PROJECT_SOURCE_DIR}/src/contrib/dxc/inc")
endif() endif()
option(RT64_STATIC "Build RT64 as a static library" OFF)
if (${RT64_STATIC}) if (${RT64_STATIC})
add_library(rt64 STATIC ${SOURCES}) add_library(rt64 STATIC ${SOURCES})
else() else()
@ -364,6 +370,7 @@ set_target_properties(rt64 PROPERTIES OUTPUT_NAME "rt64")
set_target_properties(rt64 PROPERTIES PREFIX "") set_target_properties(rt64 PROPERTIES PREFIX "")
# Add common libraries. # Add common libraries.
target_link_libraries(rt64 re-spirv)
target_link_libraries(rt64 nfd) target_link_libraries(rt64 nfd)
target_link_libraries(rt64 libzstd_static) target_link_libraries(rt64 libzstd_static)

1
src/contrib/re-spirv Submodule

@ -0,0 +1 @@
Subproject commit f0ad27a50339e72d4c86b3436b9f74de83a20544

View File

@ -228,9 +228,12 @@ namespace RT64 {
shaderLibrary->setupCommonShaders(renderInterface.get(), device.get()); shaderLibrary->setupCommonShaders(renderInterface.get(), device.get());
shaderLibrary->setupMultisamplingShaders(renderInterface.get(), device.get(), multisampling); shaderLibrary->setupMultisamplingShaders(renderInterface.get(), device.get(), multisampling);
// Create the shader caches. Estimate the amount of shader compiler threads by trying to use about half of the system's available threads. // Create the shader caches.
// Estimate the amount of shader compiler threads by trying to use about half of the system's available threads.
// We need the ubershader pipelines done as soon as possible, so we use a different thread count that demands more of the system.
const uint32_t rasterShaderThreads = std::max(threadsAvailable / 2U, 1U); const uint32_t rasterShaderThreads = std::max(threadsAvailable / 2U, 1U);
rasterShaderCache = std::make_unique<RasterShaderCache>(rasterShaderThreads); const uint32_t ubershaderThreads = uint32_t(std::max(int(threadsAvailable) - 2, 1));
rasterShaderCache = std::make_unique<RasterShaderCache>(rasterShaderThreads, ubershaderThreads);
rasterShaderCache->setup(device.get(), renderInterface->getCapabilities().shaderFormat, shaderLibrary.get(), multisampling); rasterShaderCache->setup(device.get(), renderInterface->getCapabilities().shaderFormat, shaderLibrary.get(), multisampling);
# if RT_ENABLED # if RT_ENABLED

View File

@ -1544,11 +1544,8 @@ namespace RT64 {
else { else {
const bool copyMode = (call.shaderDesc.otherMode.cycleType() == G_CYC_COPY); const bool copyMode = (call.shaderDesc.otherMode.cycleType() == G_CYC_COPY);
triangles.pipeline = rasterShaderUber->getPipeline( triangles.pipeline = rasterShaderUber->getPipeline(
!copyMode && interop::Blender::usesAlphaBlend(call.shaderDesc.otherMode), !copyMode && call.shaderDesc.otherMode.zCmp() && (call.shaderDesc.otherMode.zMode() != ZMODE_DEC),
!copyMode && call.shaderDesc.flags.culling,
!copyMode && call.shaderDesc.otherMode.zCmp(),
!copyMode && call.shaderDesc.otherMode.zUpd(), !copyMode && call.shaderDesc.otherMode.zUpd(),
!copyMode && (call.shaderDesc.otherMode.zMode() == ZMODE_DEC),
(call.shaderDesc.otherMode.cvgDst() == CVG_DST_WRAP) || (call.shaderDesc.otherMode.cvgDst() == CVG_DST_SAVE)); (call.shaderDesc.otherMode.cvgDst() == CVG_DST_WRAP) || (call.shaderDesc.otherMode.cvgDst() == CVG_DST_SAVE));
} }

View File

@ -52,10 +52,31 @@ namespace RT64 {
RenderInputElement("COLOR", 0, 2, RasterColorFormat, 2, 0) RenderInputElement("COLOR", 0, 2, RasterColorFormat, 2, 0)
}; };
// OptimizerCacheSPIRV
void OptimizerCacheSPIRV::initialize() {
rasterVS.parse(RasterVSSpecConstantBlobSPIRV, std::size(RasterVSSpecConstantBlobSPIRV));
rasterVSFlat.parse(RasterVSSpecConstantFlatBlobSPIRV, std::size(RasterVSSpecConstantFlatBlobSPIRV));
rasterPS.parse(RasterPSSpecConstantBlobSPIRV, std::size(RasterPSSpecConstantBlobSPIRV));
rasterPSDepth.parse(RasterPSSpecConstantDepthBlobSPIRV, std::size(RasterPSSpecConstantDepthBlobSPIRV));
rasterPSDepthMS.parse(RasterPSSpecConstantDepthMSBlobSPIRV, std::size(RasterPSSpecConstantDepthMSBlobSPIRV));
rasterPSFlatDepth.parse(RasterPSSpecConstantFlatDepthBlobSPIRV, std::size(RasterPSSpecConstantFlatDepthBlobSPIRV));
rasterPSFlatDepthMS.parse(RasterPSSpecConstantFlatDepthMSBlobSPIRV, std::size(RasterPSSpecConstantFlatDepthMSBlobSPIRV));
rasterPSFlat.parse(RasterPSSpecConstantFlatBlobSPIRV, std::size(RasterPSSpecConstantFlatBlobSPIRV));
assert(!rasterVS.empty());
assert(!rasterVSFlat.empty());
assert(!rasterPS.empty());
assert(!rasterPSDepth.empty());
assert(!rasterPSDepthMS.empty());
assert(!rasterPSFlatDepth.empty());
assert(!rasterPSFlatDepthMS.empty());
assert(!rasterPSFlat.empty());
}
// RasterShader // RasterShader
RasterShader::RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling, RasterShader::RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling,
const ShaderCompiler *shaderCompiler, std::vector<uint8_t> *vsBytes, std::vector<uint8_t> *psBytes, bool useBytes) const ShaderCompiler *shaderCompiler, const OptimizerCacheSPIRV *optimizerCacheSPIRV, std::vector<uint8_t> *vsBytes, std::vector<uint8_t> *psBytes, bool useBytes)
{ {
assert(device != nullptr); assert(device != nullptr);
@ -65,54 +86,55 @@ namespace RT64 {
const bool useMSAA = (multisampling.sampleCount > 1); const bool useMSAA = (multisampling.sampleCount > 1);
std::unique_ptr<RenderShader> vertexShader; std::unique_ptr<RenderShader> vertexShader;
std::unique_ptr<RenderShader> pixelShader; std::unique_ptr<RenderShader> pixelShader;
std::vector<RenderSpecConstant> specConstants;
if (shaderFormat == RenderShaderFormat::SPIRV) { if (shaderFormat == RenderShaderFormat::SPIRV) {
// Choose the pre-compiled shader permutations. // Choose the pre-compiled shader permutations.
const void *VSBlob = nullptr; const respv::Shader *VS = nullptr;
const void *PSBlob = nullptr; const respv::Shader *PS = nullptr;
uint32_t VSBlobSize = 0;
uint32_t PSBlobSize = 0;
const bool outputDepth = desc.outputDepth(useMSAA); const bool outputDepth = desc.outputDepth(useMSAA);
if (desc.flags.smoothShade) { VS = desc.flags.smoothShade ? &optimizerCacheSPIRV->rasterVS : &optimizerCacheSPIRV->rasterVSFlat;
VSBlob = RasterVSSpecConstantBlobSPIRV;
VSBlobSize = uint32_t(std::size(RasterVSSpecConstantBlobSPIRV));
}
else {
VSBlob = RasterVSSpecConstantFlatBlobSPIRV;
VSBlobSize = uint32_t(std::size(RasterVSSpecConstantFlatBlobSPIRV));
}
// Pick the correct SPIR-V based on the configuration. // Pick the correct SPIR-V based on the configuration.
if (desc.flags.smoothShade) { if (desc.flags.smoothShade) {
if (outputDepth) { if (outputDepth) {
PSBlob = useMSAA ? RasterPSSpecConstantDepthMSBlobSPIRV : RasterPSSpecConstantDepthBlobSPIRV; PS = useMSAA ? &optimizerCacheSPIRV->rasterPSDepthMS : &optimizerCacheSPIRV->rasterPSDepth;
PSBlobSize = uint32_t(useMSAA ? std::size(RasterPSSpecConstantDepthMSBlobSPIRV) : std::size(RasterPSSpecConstantDepthBlobSPIRV));
} }
else { else {
PSBlob = RasterPSSpecConstantBlobSPIRV; PS = &optimizerCacheSPIRV->rasterPS;
PSBlobSize = uint32_t(std::size(RasterPSSpecConstantBlobSPIRV));
} }
} }
else { else {
if (outputDepth) { if (outputDepth) {
PSBlob = useMSAA ? RasterPSSpecConstantFlatDepthMSBlobSPIRV : RasterPSSpecConstantFlatDepthBlobSPIRV; PS = useMSAA ? &optimizerCacheSPIRV->rasterPSFlatDepthMS : &optimizerCacheSPIRV->rasterPSFlatDepth;
PSBlobSize = uint32_t(useMSAA ? std::size(RasterPSSpecConstantFlatDepthMSBlobSPIRV) : std::size(RasterPSSpecConstantFlatDepthBlobSPIRV));
} }
else { else {
PSBlob = RasterPSSpecConstantFlatBlobSPIRV; PS = &optimizerCacheSPIRV->rasterPSFlat;
PSBlobSize = uint32_t(std::size(RasterPSSpecConstantFlatBlobSPIRV));
} }
} }
thread_local std::vector<respv::SpecConstant> specConstants;
thread_local bool specConstantsSetup = false;
thread_local std::vector<uint8_t> optimizedVS;
thread_local std::vector<uint8_t> optimizedPS;
if (!specConstantsSetup) {
for (uint32_t i = 0; i < 5; i++) {
specConstants.push_back(respv::SpecConstant(i, { 0 }));
}
specConstantsSetup = true;
}
vertexShader = device->createShader(VSBlob, VSBlobSize, "VSMain", shaderFormat); specConstants[0].values[0] = desc.otherMode.L;
pixelShader = device->createShader(PSBlob, PSBlobSize, "PSMain", shaderFormat); specConstants[1].values[0] = desc.otherMode.H;
specConstants[2].values[0] = desc.colorCombiner.L;
specConstants[3].values[0] = desc.colorCombiner.H;
specConstants[4].values[0] = desc.flags.value;
// Spec constants should replace the constants embedded in the shader directly. bool vsRun = respv::Optimizer::run(*VS, specConstants.data(), uint32_t(specConstants.size()), optimizedVS);
specConstants.emplace_back(0, desc.otherMode.L); bool psRun = respv::Optimizer::run(*PS, specConstants.data(), uint32_t(specConstants.size()), optimizedPS);
specConstants.emplace_back(1, desc.otherMode.H); assert(vsRun && psRun && "Shader optimization must always succeed as the inputs are always the same.");
specConstants.emplace_back(2, desc.colorCombiner.L);
specConstants.emplace_back(3, desc.colorCombiner.H); vertexShader = device->createShader(optimizedVS.data(), optimizedVS.size(), "VSMain", shaderFormat);
specConstants.emplace_back(4, desc.flags.value); pixelShader = device->createShader(optimizedPS.data(), optimizedPS.size(), "PSMain", shaderFormat);
} }
else { else {
# if defined(_WIN32) # if defined(_WIN32)
@ -180,13 +202,11 @@ namespace RT64 {
creation.pixelShader = pixelShader.get(); creation.pixelShader = pixelShader.get();
creation.alphaBlend = !copyMode && interop::Blender::usesAlphaBlend(desc.otherMode); creation.alphaBlend = !copyMode && interop::Blender::usesAlphaBlend(desc.otherMode);
creation.culling = !copyMode && desc.flags.culling; creation.culling = !copyMode && desc.flags.culling;
creation.zCmp = !copyMode && desc.otherMode.zCmp(); creation.zCmp = !copyMode && desc.otherMode.zCmp() && (desc.otherMode.zMode() != ZMODE_DEC);
creation.zUpd = !copyMode && desc.otherMode.zUpd(); creation.zUpd = !copyMode && desc.otherMode.zUpd();
creation.zDecal = !copyMode && (desc.otherMode.zMode() == ZMODE_DEC);
creation.cvgAdd = (desc.otherMode.cvgDst() == CVG_DST_WRAP) || (desc.otherMode.cvgDst() == CVG_DST_SAVE); creation.cvgAdd = (desc.otherMode.cvgDst() == CVG_DST_WRAP) || (desc.otherMode.cvgDst() == CVG_DST_SAVE);
creation.NoN = desc.flags.NoN; creation.NoN = desc.flags.NoN;
creation.usesHDR = desc.flags.usesHDR; creation.usesHDR = desc.flags.usesHDR;
creation.specConstants = specConstants;
creation.multisampling = multisampling; creation.multisampling = multisampling;
pipeline = createPipeline(creation); pipeline = createPipeline(creation);
} }
@ -227,7 +247,7 @@ namespace RT64 {
pss << std::string_view(RenderParamsText, sizeof(RenderParamsText)); pss << std::string_view(RenderParamsText, sizeof(RenderParamsText));
pss << "RenderParams getRenderParams() {" + renderParamsCode + "; return rp; }"; pss << "RenderParams getRenderParams() {" + renderParamsCode + "; return rp; }";
pss << pss <<
"bool RasterPS(const RenderParams, bool, float4, float2, float4, float4, uint, out float4, out float4, out float);" "bool RasterPS(const RenderParams, bool, float4, float2, float4, float4, bool, uint, out float4, out float4, out float);"
"[shader(\"pixel\")]" "[shader(\"pixel\")]"
"void PSMain(" "void PSMain("
" in float4 vertexPosition : SV_POSITION" " in float4 vertexPosition : SV_POSITION"
@ -266,7 +286,7 @@ namespace RT64 {
" float4 resultColor;" " float4 resultColor;"
" float4 resultAlpha;" " float4 resultAlpha;"
" float resultDepth;" " float resultDepth;"
" if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, sampleIndex, resultColor, resultAlpha, resultDepth)) discard;" " if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, false, sampleIndex, resultColor, resultAlpha, resultDepth)) discard;"
" pixelColor = resultColor;" " pixelColor = resultColor;"
" pixelAlpha = resultAlpha;"; " pixelAlpha = resultAlpha;";
@ -280,8 +300,6 @@ namespace RT64 {
} }
std::unique_ptr<RenderPipeline> RasterShader::createPipeline(const PipelineCreation &c) { std::unique_ptr<RenderPipeline> RasterShader::createPipeline(const PipelineCreation &c) {
assert((!c.zDecal || !c.zUpd) && "Decals with depth write should never be created.");
RenderGraphicsPipelineDesc pipelineDesc; RenderGraphicsPipelineDesc pipelineDesc;
pipelineDesc.renderTargetBlend[0] = RenderBlendDesc::Copy(); pipelineDesc.renderTargetBlend[0] = RenderBlendDesc::Copy();
pipelineDesc.renderTargetFormat[0] = RenderTarget::colorBufferFormat(c.usesHDR); pipelineDesc.renderTargetFormat[0] = RenderTarget::colorBufferFormat(c.usesHDR);
@ -289,6 +307,7 @@ namespace RT64 {
pipelineDesc.cullMode = c.culling ? RenderCullMode::FRONT : RenderCullMode::NONE; pipelineDesc.cullMode = c.culling ? RenderCullMode::FRONT : RenderCullMode::NONE;
pipelineDesc.depthClipEnabled = !c.NoN; pipelineDesc.depthClipEnabled = !c.NoN;
pipelineDesc.depthEnabled = c.zCmp || c.zUpd; pipelineDesc.depthEnabled = c.zCmp || c.zUpd;
pipelineDesc.depthFunction = c.zCmp ? RenderComparisonFunction::LESS : RenderComparisonFunction::ALWAYS;
pipelineDesc.depthWriteEnabled = c.zUpd; pipelineDesc.depthWriteEnabled = c.zUpd;
pipelineDesc.depthTargetFormat = RenderFormat::D32_FLOAT; pipelineDesc.depthTargetFormat = RenderFormat::D32_FLOAT;
pipelineDesc.multisampling = c.multisampling; pipelineDesc.multisampling = c.multisampling;
@ -303,20 +322,6 @@ namespace RT64 {
pipelineDesc.specConstants = c.specConstants.data(); pipelineDesc.specConstants = c.specConstants.data();
pipelineDesc.specConstantsCount = uint32_t(c.specConstants.size()); pipelineDesc.specConstantsCount = uint32_t(c.specConstants.size());
if (c.zCmp) {
// While these modes evaluate equality in the hardware, we use LEQUAL to simulate the depth comparison in the shader instead.
if (c.zDecal) {
pipelineDesc.depthFunction = RenderComparisonFunction::LESS_EQUAL;
}
// ZMODE_OPA, ZMODE_XLU and ZMODE_INTER only differ based on coverage, which is not emulated, so they can all be approximated the same way.
else {
pipelineDesc.depthFunction = RenderComparisonFunction::LESS;
}
}
else {
pipelineDesc.depthFunction = RenderComparisonFunction::ALWAYS;
}
// Alpha blending is performed by using dual source blending. The blending factor will be in the secondary output. // Alpha blending is performed by using dual source blending. The blending factor will be in the secondary output.
RenderBlendDesc &targetBlend = pipelineDesc.renderTargetBlend[0]; RenderBlendDesc &targetBlend = pipelineDesc.renderTargetBlend[0];
if (c.alphaBlend) { if (c.alphaBlend) {
@ -438,32 +443,26 @@ namespace RT64 {
pipelineLayout = layoutBuilder.create(device); pipelineLayout = layoutBuilder.create(device);
// Generate all possible combinations of pipeline creations and assign them to each thread. Skip the ones that are invalid. // Generate all possible combinations of pipeline creations and assign them to each thread. Skip the ones that are invalid.
uint32_t pipelineCount = uint32_t(std::size(pipelines));
pipelineThreadCreations.clear(); pipelineThreadCreations.clear();
pipelineThreadCreations.resize(threadCount); pipelineThreadCreations.resize(std::min(threadCount, pipelineCount));
PipelineCreation creation; PipelineCreation creation;
creation.device = device; creation.device = device;
creation.pipelineLayout = pipelineLayout.get(); creation.pipelineLayout = pipelineLayout.get();
creation.vertexShader = vertexShader.get(); creation.vertexShader = vertexShader.get();
creation.pixelShader = pixelShader.get(); creation.pixelShader = pixelShader.get();
creation.alphaBlend = true;
creation.culling = false;
creation.NoN = true; creation.NoN = true;
creation.usesHDR = shaderLibrary->usesHDR; creation.usesHDR = shaderLibrary->usesHDR;
creation.multisampling = multisampling; creation.multisampling = multisampling;
uint32_t threadIndex = 0; uint32_t threadIndex = 0;
uint32_t pipelineCount = uint32_t(std::size(pipelines));
for (uint32_t i = 0; i < pipelineCount; i++) { for (uint32_t i = 0; i < pipelineCount; i++) {
creation.alphaBlend = i & (1 << 0); creation.zCmp = i & (1 << 0);
creation.culling = i & (1 << 1); creation.zUpd = i & (1 << 1);
creation.zCmp = i & (1 << 2); creation.cvgAdd = i & (1 << 2);
creation.zUpd = i & (1 << 3);
creation.zDecal = i & (1 << 4);
creation.cvgAdd = i & (1 << 5);
// Skip all PSOs that would lead to invalid decal behavior.
if (creation.zDecal && (creation.zUpd || !creation.zCmp)) {
continue;
}
pipelineThreadCreations[threadIndex].emplace_back(creation); pipelineThreadCreations[threadIndex].emplace_back(creation);
threadIndex = (threadIndex + 1) % threadCount; threadIndex = (threadIndex + 1) % threadCount;
@ -471,8 +470,8 @@ namespace RT64 {
// Spawn the threads that will compile all the pipelines. // Spawn the threads that will compile all the pipelines.
pipelineThreads.clear(); pipelineThreads.clear();
pipelineThreads.resize(threadCount); pipelineThreads.resize(pipelineThreadCreations.size());
for (uint32_t i = 0; i < threadCount; i++) { for (uint32_t i = 0; i < uint32_t(pipelineThreads.size()); i++) {
pipelineThreads[i] = std::make_unique<std::thread>(&RasterShaderUber::threadCreatePipelines, this, i); pipelineThreads[i] = std::make_unique<std::thread>(&RasterShaderUber::threadCreatePipelines, this, i);
} }
@ -533,7 +532,7 @@ namespace RT64 {
void RasterShaderUber::threadCreatePipelines(uint32_t threadIndex) { void RasterShaderUber::threadCreatePipelines(uint32_t threadIndex) {
for (const PipelineCreation &creation : pipelineThreadCreations[threadIndex]) { for (const PipelineCreation &creation : pipelineThreadCreations[threadIndex]) {
uint32_t pipelineIndex = pipelineStateIndex(creation.alphaBlend, creation.culling, creation.zCmp, creation.zUpd, creation.zDecal, creation.cvgAdd); uint32_t pipelineIndex = pipelineStateIndex(creation.zCmp, creation.zUpd, creation.cvgAdd);
pipelines[pipelineIndex] = RasterShader::createPipeline(creation); pipelines[pipelineIndex] = RasterShader::createPipeline(creation);
} }
} }
@ -552,23 +551,14 @@ namespace RT64 {
} }
} }
uint32_t RasterShaderUber::pipelineStateIndex(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const { uint32_t RasterShaderUber::pipelineStateIndex(bool zCmp, bool zUpd, bool cvgAdd) const {
return return
(uint32_t(alphaBlend) << 0) | (uint32_t(zCmp) << 0) |
(uint32_t(culling) << 1) | (uint32_t(zUpd) << 1) |
(uint32_t(zCmp) << 2) | (uint32_t(cvgAdd) << 2);
(uint32_t(zUpd) << 3) |
(uint32_t(zDecal) << 4) |
(uint32_t(cvgAdd) << 5);
} }
const RenderPipeline *RasterShaderUber::getPipeline(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const { const RenderPipeline *RasterShaderUber::getPipeline(bool zCmp, bool zUpd, bool cvgAdd) const {
// Force read and turn off writing on decal modes since those PSOs are not generated. return pipelines[pipelineStateIndex(zCmp, zUpd, cvgAdd)].get();
if (zDecal) {
zCmp = true;
zUpd = false;
}
return pipelines[pipelineStateIndex(alphaBlend, culling, zCmp, zUpd, zDecal, cvgAdd)].get();
} }
}; };

View File

@ -9,6 +9,8 @@
#include <mutex> #include <mutex>
#include <thread> #include <thread>
#include "re-spirv/re-spirv.h"
#include "rhi/rt64_render_interface.h" #include "rhi/rt64_render_interface.h"
#include "shared/rt64_blender.h" #include "shared/rt64_blender.h"
#include "shared/rt64_color_combiner.h" #include "shared/rt64_color_combiner.h"
@ -21,6 +23,19 @@
#define SAMPLE_LOCATIONS_REQUIRED 1 #define SAMPLE_LOCATIONS_REQUIRED 1
namespace RT64 { namespace RT64 {
struct OptimizerCacheSPIRV {
respv::Shader rasterVS;
respv::Shader rasterVSFlat;
respv::Shader rasterPS;
respv::Shader rasterPSDepth;
respv::Shader rasterPSDepthMS;
respv::Shader rasterPSFlatDepth;
respv::Shader rasterPSFlatDepthMS;
respv::Shader rasterPSFlat;
void initialize();
};
struct PipelineCreation { struct PipelineCreation {
RenderDevice *device; RenderDevice *device;
const RenderPipelineLayout *pipelineLayout; const RenderPipelineLayout *pipelineLayout;
@ -31,7 +46,6 @@ namespace RT64 {
bool NoN; bool NoN;
bool zCmp; bool zCmp;
bool zUpd; bool zUpd;
bool zDecal;
bool cvgAdd; bool cvgAdd;
bool usesHDR; bool usesHDR;
std::vector<RenderSpecConstant> specConstants; std::vector<RenderSpecConstant> specConstants;
@ -49,7 +63,7 @@ namespace RT64 {
std::unique_ptr<RenderPipeline> pipeline; std::unique_ptr<RenderPipeline> pipeline;
RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling, RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling,
const ShaderCompiler *shaderCompiler, std::vector<uint8_t> *vsBytes = nullptr, std::vector<uint8_t> *psBytes = nullptr, bool useBytes = false); const ShaderCompiler *shaderCompiler, const OptimizerCacheSPIRV *optimizerCacheSPIRV, std::vector<uint8_t> *vsBytes = nullptr, std::vector<uint8_t> *psBytes = nullptr, bool useBytes = false);
~RasterShader(); ~RasterShader();
static RasterShaderText generateShaderText(const ShaderDescription &desc, bool multisampling); static RasterShaderText generateShaderText(const ShaderDescription &desc, bool multisampling);
@ -61,7 +75,7 @@ namespace RT64 {
static const uint64_t RasterVSLibraryHash; static const uint64_t RasterVSLibraryHash;
static const uint64_t RasterPSLibraryHash; static const uint64_t RasterPSLibraryHash;
std::unique_ptr<RenderPipeline> pipelines[64]; std::unique_ptr<RenderPipeline> pipelines[8];
std::unique_ptr<RenderPipeline> postBlendDitherNoiseAddPipeline; std::unique_ptr<RenderPipeline> postBlendDitherNoiseAddPipeline;
std::unique_ptr<RenderPipeline> postBlendDitherNoiseSubPipeline; std::unique_ptr<RenderPipeline> postBlendDitherNoiseSubPipeline;
std::mutex pipelinesMutex; std::mutex pipelinesMutex;
@ -76,7 +90,7 @@ namespace RT64 {
~RasterShaderUber(); ~RasterShaderUber();
void threadCreatePipelines(uint32_t threadIndex); void threadCreatePipelines(uint32_t threadIndex);
void waitForPipelineCreation(); void waitForPipelineCreation();
uint32_t pipelineStateIndex(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const; uint32_t pipelineStateIndex(bool zCmp, bool zUpd, bool cvgAdd) const;
const RenderPipeline *getPipeline(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const; const RenderPipeline *getPipeline(bool zCmp, bool zUpd, bool cvgAdd) const;
}; };
}; };

View File

@ -209,7 +209,7 @@ namespace RT64 {
assert((shaderCache->shaderUber != nullptr) && "Ubershader should've been created by the time a new shader is submitted to the cache."); assert((shaderCache->shaderUber != nullptr) && "Ubershader should've been created by the time a new shader is submitted to the cache.");
const RenderPipelineLayout *uberPipelineLayout = shaderCache->shaderUber->pipelineLayout.get(); const RenderPipelineLayout *uberPipelineLayout = shaderCache->shaderUber->pipelineLayout.get();
const RenderMultisampling multisampling = shaderCache->multisampling; const RenderMultisampling multisampling = shaderCache->multisampling;
std::unique_ptr<RasterShader> newShader = std::make_unique<RasterShader>(shaderCache->device, shaderDesc, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), shaderVsBytes, shaderPsBytes, useShaderBytes); std::unique_ptr<RasterShader> newShader = std::make_unique<RasterShader>(shaderCache->device, shaderDesc, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), &shaderCache->optimizerCacheSPIRV, shaderVsBytes, shaderPsBytes, useShaderBytes);
// Dump the bytes of the shader if requested. // Dump the bytes of the shader if requested.
if (!useShaderBytes && (shaderVsBytes != nullptr) && (shaderPsBytes != nullptr)) { if (!useShaderBytes && (shaderVsBytes != nullptr) && (shaderPsBytes != nullptr)) {
@ -220,7 +220,7 @@ namespace RT64 {
// Toggle the use of HDR and compile another shader. // Toggle the use of HDR and compile another shader.
ShaderDescription shaderDescAlt = shaderDesc; ShaderDescription shaderDescAlt = shaderDesc;
shaderDescAlt.flags.usesHDR = (shaderDescAlt.flags.usesHDR == 0); shaderDescAlt.flags.usesHDR = (shaderDescAlt.flags.usesHDR == 0);
std::unique_ptr<RasterShader> altShader = std::make_unique<RasterShader>(shaderCache->device, shaderDescAlt, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), shaderVsBytes, shaderPsBytes, useShaderBytes); std::unique_ptr<RasterShader> altShader = std::make_unique<RasterShader>(shaderCache->device, shaderDescAlt, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), &shaderCache->optimizerCacheSPIRV, shaderVsBytes, shaderPsBytes, useShaderBytes);
shaderCache->offlineDumper.stepDumping(shaderDescAlt, dumperVsBytes, dumperPsBytes); shaderCache->offlineDumper.stepDumping(shaderDescAlt, dumperVsBytes, dumperPsBytes);
} }
} }
@ -235,10 +235,11 @@ namespace RT64 {
// RasterShaderCache // RasterShaderCache
RasterShaderCache::RasterShaderCache(uint32_t threadCount) { RasterShaderCache::RasterShaderCache(uint32_t threadCount, uint32_t ubershaderThreadCount) {
assert(threadCount > 0); assert(threadCount > 0);
this->threadCount = threadCount; this->threadCount = threadCount;
this->ubershaderThreadCount = ubershaderThreadCount;
#ifdef ENABLE_OPTIMIZED_SHADER_GENERATION #ifdef ENABLE_OPTIMIZED_SHADER_GENERATION
# ifdef _WIN32 # ifdef _WIN32
@ -264,8 +265,13 @@ namespace RT64 {
this->shaderFormat = shaderFormat; this->shaderFormat = shaderFormat;
this->multisampling = multisampling; this->multisampling = multisampling;
shaderUber = std::make_unique<RasterShaderUber>(device, shaderFormat, multisampling, shaderLibrary, threadCount); shaderUber = std::make_unique<RasterShaderUber>(device, shaderFormat, multisampling, shaderLibrary, ubershaderThreadCount);
usesHDR = shaderLibrary->usesHDR; usesHDR = shaderLibrary->usesHDR;
// Initialize the re-spirv optimizer cache.
if (shaderFormat == RenderShaderFormat::SPIRV) {
optimizerCacheSPIRV.initialize();
}
} }
void RasterShaderCache::submit(const ShaderDescription &desc) { void RasterShaderCache::submit(const ShaderDescription &desc) {

View File

@ -55,6 +55,7 @@ namespace RT64 {
RenderDevice *device; RenderDevice *device;
std::unique_ptr<RasterShaderUber> shaderUber; std::unique_ptr<RasterShaderUber> shaderUber;
OptimizerCacheSPIRV optimizerCacheSPIRV;
std::mutex submissionMutex; std::mutex submissionMutex;
std::queue<ShaderDescription> descQueue; std::queue<ShaderDescription> descQueue;
std::mutex descQueueMutex; std::mutex descQueueMutex;
@ -65,6 +66,7 @@ namespace RT64 {
std::mutex GPUShadersMutex; std::mutex GPUShadersMutex;
std::list<std::unique_ptr<CompilationThread>> compilationThreads; std::list<std::unique_ptr<CompilationThread>> compilationThreads;
uint32_t threadCount; uint32_t threadCount;
uint32_t ubershaderThreadCount;
RenderShaderFormat shaderFormat; RenderShaderFormat shaderFormat;
std::unique_ptr<ShaderCompiler> shaderCompiler; std::unique_ptr<ShaderCompiler> shaderCompiler;
RenderMultisampling multisampling; RenderMultisampling multisampling;
@ -73,7 +75,7 @@ namespace RT64 {
std::mutex offlineDumperMutex; std::mutex offlineDumperMutex;
bool usesHDR = false; bool usesHDR = false;
RasterShaderCache(uint32_t threadCount); RasterShaderCache(uint32_t threadCount, uint32_t ubershaderThreadCount);
~RasterShaderCache(); ~RasterShaderCache();
void setup(RenderDevice *device, RenderShaderFormat shaderFormat, const ShaderLibrary *shaderLibrary, const RenderMultisampling &multisampling); void setup(RenderDevice *device, RenderShaderFormat shaderFormat, const ShaderLibrary *shaderLibrary, const RenderMultisampling &multisampling);
void submit(const ShaderDescription &desc); void submit(const ShaderDescription &desc);

View File

@ -29,12 +29,19 @@ float sampleBackgroundDepth(int2 pixelPos, uint sampleIndex) {
#endif #endif
LIBRARY_EXPORT bool RasterPS(const RenderParams rp, bool outputDepth, float4 vertexPosition, float2 vertexUV, float4 vertexSmoothColor, float4 vertexFlatColor, LIBRARY_EXPORT bool RasterPS(const RenderParams rp, bool outputDepth, float4 vertexPosition, float2 vertexUV, float4 vertexSmoothColor, float4 vertexFlatColor,
uint sampleIndex, out float4 resultColor, out float4 resultAlpha, out float resultDepth) bool isFrontFace, uint sampleIndex, out float4 resultColor, out float4 resultAlpha, out float resultDepth)
{ {
const OtherMode otherMode = { rp.omL, rp.omH };
#if defined(DYNAMIC_RENDER_PARAMS)
if ((otherMode.cycleType() != G_CYC_COPY) && renderFlagCulling(rp.flags) && isFrontFace) {
resultDepth = vertexPosition.z;
return false;
}
#endif
const uint instanceIndex = instanceRenderIndices[gConstants.renderIndex].instanceIndex; const uint instanceIndex = instanceRenderIndices[gConstants.renderIndex].instanceIndex;
const float4 vertexColor = renderFlagSmoothShade(rp.flags) ? vertexSmoothColor : float4(vertexFlatColor.rgb, vertexSmoothColor.a); const float4 vertexColor = renderFlagSmoothShade(rp.flags) ? vertexSmoothColor : float4(vertexFlatColor.rgb, vertexSmoothColor.a);
const ColorCombiner colorCombiner = { rp.ccL, rp.ccH }; const ColorCombiner colorCombiner = { rp.ccL, rp.ccH };
const OtherMode otherMode = { rp.omL, rp.omH };
const bool depthClampNear = renderFlagNoN(rp.flags); const bool depthClampNear = renderFlagNoN(rp.flags);
const bool depthDecal = (otherMode.zMode() == ZMODE_DEC); const bool depthDecal = (otherMode.zMode() == ZMODE_DEC);
const bool zSourcePrim = (otherMode.zSource() == G_ZS_PRIM); const bool zSourcePrim = (otherMode.zSource() == G_ZS_PRIM);
@ -269,6 +276,9 @@ void PSMain(
#if defined(DYNAMIC_RENDER_PARAMS) || defined(VERTEX_FLAT_COLOR) #if defined(DYNAMIC_RENDER_PARAMS) || defined(VERTEX_FLAT_COLOR)
, nointerpolation in float4 vertexFlatColor : COLOR1 , nointerpolation in float4 vertexFlatColor : COLOR1
#endif #endif
#if defined(DYNAMIC_RENDER_PARAMS)
, bool isFrontFace : SV_IsFrontFace
#endif
#if defined(MULTISAMPLING) #if defined(MULTISAMPLING)
, in uint sampleIndex : SV_SampleIndex , in uint sampleIndex : SV_SampleIndex
#endif #endif
@ -283,6 +293,7 @@ void PSMain(
#if !defined(VERTEX_FLAT_COLOR) #if !defined(VERTEX_FLAT_COLOR)
float4 vertexFlatColor = 0.0f; float4 vertexFlatColor = 0.0f;
#endif #endif
bool isFrontFace = false;
#endif #endif
#if !defined(MULTISAMPLING) #if !defined(MULTISAMPLING)
uint sampleIndex = 0; uint sampleIndex = 0;
@ -295,7 +306,7 @@ void PSMain(
float4 resultColor; float4 resultColor;
float4 resultAlpha; float4 resultAlpha;
float resultDepth; float resultDepth;
if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, sampleIndex, resultColor, resultAlpha, resultDepth)) { if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, isFrontFace, sampleIndex, resultColor, resultAlpha, resultDepth)) {
discard; discard;
} }