diff --git a/.gitmodules b/.gitmodules index 7a5b640..57e3d6f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -52,3 +52,6 @@ [submodule "src/contrib/zstd"] path = src/contrib/zstd url = https://github.com/facebook/zstd +[submodule "src/contrib/re-spirv"] + path = src/contrib/re-spirv + url = https://github.com/rt64/re-spirv diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cddf06..8aac0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,11 +3,17 @@ project(rt64) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_VISIBILITY_PRESET hidden) +option(RT64_STATIC "Build RT64 as a static library" OFF) + option(RT64_BUILD_EXAMPLES "Build examples for RT64" OFF) if (${RT64_BUILD_EXAMPLES}) set(RT64_STATIC ON) endif() +if (NOT ${RT64_STATIC}) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif() + function(preprocess INFILE OUTFILE OPTIONS) if (CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") @@ -56,6 +62,7 @@ set(ZSTD_BUILD_STATIC ON) set(ZSTD_BUILD_SHARED OFF) add_subdirectory(src/tools/file_to_c) +add_subdirectory(src/contrib/re-spirv) add_subdirectory(src/contrib/nativefiledialog-extended) add_subdirectory(src/contrib/zstd/build/cmake) @@ -353,7 +360,6 @@ if (WIN32) include_directories("${PROJECT_SOURCE_DIR}/src/contrib/dxc/inc") endif() -option(RT64_STATIC "Build RT64 as a static library" OFF) if (${RT64_STATIC}) add_library(rt64 STATIC ${SOURCES}) else() @@ -364,6 +370,7 @@ set_target_properties(rt64 PROPERTIES OUTPUT_NAME "rt64") set_target_properties(rt64 PROPERTIES PREFIX "") # Add common libraries. 
+target_link_libraries(rt64 re-spirv) target_link_libraries(rt64 nfd) target_link_libraries(rt64 libzstd_static) diff --git a/src/contrib/re-spirv b/src/contrib/re-spirv new file mode 160000 index 0000000..f0ad27a --- /dev/null +++ b/src/contrib/re-spirv @@ -0,0 +1 @@ +Subproject commit f0ad27a50339e72d4c86b3436b9f74de83a20544 diff --git a/src/hle/rt64_application.cpp b/src/hle/rt64_application.cpp index 8b40312..40f10a4 100644 --- a/src/hle/rt64_application.cpp +++ b/src/hle/rt64_application.cpp @@ -228,9 +228,12 @@ namespace RT64 { shaderLibrary->setupCommonShaders(renderInterface.get(), device.get()); shaderLibrary->setupMultisamplingShaders(renderInterface.get(), device.get(), multisampling); - // Create the shader caches. Estimate the amount of shader compiler threads by trying to use about half of the system's available threads. + // Create the shader caches. + // Estimate the number of shader compiler threads by trying to use about half of the system's available threads. + // We need the ubershader pipelines done as soon as possible, so we use a different thread count that demands more of the system. 
const uint32_t rasterShaderThreads = std::max(threadsAvailable / 2U, 1U); - rasterShaderCache = std::make_unique(rasterShaderThreads); + const uint32_t ubershaderThreads = uint32_t(std::max(int(threadsAvailable) - 2, 1)); + rasterShaderCache = std::make_unique(rasterShaderThreads, ubershaderThreads); rasterShaderCache->setup(device.get(), renderInterface->getCapabilities().shaderFormat, shaderLibrary.get(), multisampling); # if RT_ENABLED diff --git a/src/render/rt64_framebuffer_renderer.cpp b/src/render/rt64_framebuffer_renderer.cpp index 5fc39f9..0facb1e 100644 --- a/src/render/rt64_framebuffer_renderer.cpp +++ b/src/render/rt64_framebuffer_renderer.cpp @@ -1544,11 +1544,8 @@ namespace RT64 { else { const bool copyMode = (call.shaderDesc.otherMode.cycleType() == G_CYC_COPY); triangles.pipeline = rasterShaderUber->getPipeline( - !copyMode && interop::Blender::usesAlphaBlend(call.shaderDesc.otherMode), - !copyMode && call.shaderDesc.flags.culling, - !copyMode && call.shaderDesc.otherMode.zCmp(), + !copyMode && call.shaderDesc.otherMode.zCmp() && (call.shaderDesc.otherMode.zMode() != ZMODE_DEC), !copyMode && call.shaderDesc.otherMode.zUpd(), - !copyMode && (call.shaderDesc.otherMode.zMode() == ZMODE_DEC), (call.shaderDesc.otherMode.cvgDst() == CVG_DST_WRAP) || (call.shaderDesc.otherMode.cvgDst() == CVG_DST_SAVE)); } diff --git a/src/render/rt64_raster_shader.cpp b/src/render/rt64_raster_shader.cpp index ea65bb1..edaef83 100644 --- a/src/render/rt64_raster_shader.cpp +++ b/src/render/rt64_raster_shader.cpp @@ -52,10 +52,31 @@ namespace RT64 { RenderInputElement("COLOR", 0, 2, RasterColorFormat, 2, 0) }; + // OptimizerCacheSPIRV + + void OptimizerCacheSPIRV::initialize() { + rasterVS.parse(RasterVSSpecConstantBlobSPIRV, std::size(RasterVSSpecConstantBlobSPIRV)); + rasterVSFlat.parse(RasterVSSpecConstantFlatBlobSPIRV, std::size(RasterVSSpecConstantFlatBlobSPIRV)); + rasterPS.parse(RasterPSSpecConstantBlobSPIRV, std::size(RasterPSSpecConstantBlobSPIRV)); + 
rasterPSDepth.parse(RasterPSSpecConstantDepthBlobSPIRV, std::size(RasterPSSpecConstantDepthBlobSPIRV)); + rasterPSDepthMS.parse(RasterPSSpecConstantDepthMSBlobSPIRV, std::size(RasterPSSpecConstantDepthMSBlobSPIRV)); + rasterPSFlatDepth.parse(RasterPSSpecConstantFlatDepthBlobSPIRV, std::size(RasterPSSpecConstantFlatDepthBlobSPIRV)); + rasterPSFlatDepthMS.parse(RasterPSSpecConstantFlatDepthMSBlobSPIRV, std::size(RasterPSSpecConstantFlatDepthMSBlobSPIRV)); + rasterPSFlat.parse(RasterPSSpecConstantFlatBlobSPIRV, std::size(RasterPSSpecConstantFlatBlobSPIRV)); + assert(!rasterVS.empty()); + assert(!rasterVSFlat.empty()); + assert(!rasterPS.empty()); + assert(!rasterPSDepth.empty()); + assert(!rasterPSDepthMS.empty()); + assert(!rasterPSFlatDepth.empty()); + assert(!rasterPSFlatDepthMS.empty()); + assert(!rasterPSFlat.empty()); + } + // RasterShader RasterShader::RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling, - const ShaderCompiler *shaderCompiler, std::vector *vsBytes, std::vector *psBytes, bool useBytes) + const ShaderCompiler *shaderCompiler, const OptimizerCacheSPIRV *optimizerCacheSPIRV, std::vector *vsBytes, std::vector *psBytes, bool useBytes) { assert(device != nullptr); @@ -65,54 +86,55 @@ namespace RT64 { const bool useMSAA = (multisampling.sampleCount > 1); std::unique_ptr vertexShader; std::unique_ptr pixelShader; - std::vector specConstants; if (shaderFormat == RenderShaderFormat::SPIRV) { // Choose the pre-compiled shader permutations. 
- const void *VSBlob = nullptr; - const void *PSBlob = nullptr; - uint32_t VSBlobSize = 0; - uint32_t PSBlobSize = 0; + const respv::Shader *VS = nullptr; + const respv::Shader *PS = nullptr; const bool outputDepth = desc.outputDepth(useMSAA); - if (desc.flags.smoothShade) { - VSBlob = RasterVSSpecConstantBlobSPIRV; - VSBlobSize = uint32_t(std::size(RasterVSSpecConstantBlobSPIRV)); - } - else { - VSBlob = RasterVSSpecConstantFlatBlobSPIRV; - VSBlobSize = uint32_t(std::size(RasterVSSpecConstantFlatBlobSPIRV)); - } - + VS = desc.flags.smoothShade ? &optimizerCacheSPIRV->rasterVS : &optimizerCacheSPIRV->rasterVSFlat; + // Pick the correct SPIR-V based on the configuration. if (desc.flags.smoothShade) { if (outputDepth) { - PSBlob = useMSAA ? RasterPSSpecConstantDepthMSBlobSPIRV : RasterPSSpecConstantDepthBlobSPIRV; - PSBlobSize = uint32_t(useMSAA ? std::size(RasterPSSpecConstantDepthMSBlobSPIRV) : std::size(RasterPSSpecConstantDepthBlobSPIRV)); + PS = useMSAA ? &optimizerCacheSPIRV->rasterPSDepthMS : &optimizerCacheSPIRV->rasterPSDepth; } else { - PSBlob = RasterPSSpecConstantBlobSPIRV; - PSBlobSize = uint32_t(std::size(RasterPSSpecConstantBlobSPIRV)); + PS = &optimizerCacheSPIRV->rasterPS; } } else { if (outputDepth) { - PSBlob = useMSAA ? RasterPSSpecConstantFlatDepthMSBlobSPIRV : RasterPSSpecConstantFlatDepthBlobSPIRV; - PSBlobSize = uint32_t(useMSAA ? std::size(RasterPSSpecConstantFlatDepthMSBlobSPIRV) : std::size(RasterPSSpecConstantFlatDepthBlobSPIRV)); + PS = useMSAA ? 
&optimizerCacheSPIRV->rasterPSFlatDepthMS : &optimizerCacheSPIRV->rasterPSFlatDepth; } else { - PSBlob = RasterPSSpecConstantFlatBlobSPIRV; - PSBlobSize = uint32_t(std::size(RasterPSSpecConstantFlatBlobSPIRV)); + PS = &optimizerCacheSPIRV->rasterPSFlat; } } + + thread_local std::vector specConstants; + thread_local bool specConstantsSetup = false; + thread_local std::vector optimizedVS; + thread_local std::vector optimizedPS; + if (!specConstantsSetup) { + for (uint32_t i = 0; i < 5; i++) { + specConstants.push_back(respv::SpecConstant(i, { 0 })); + } + + specConstantsSetup = true; + } - vertexShader = device->createShader(VSBlob, VSBlobSize, "VSMain", shaderFormat); - pixelShader = device->createShader(PSBlob, PSBlobSize, "PSMain", shaderFormat); + specConstants[0].values[0] = desc.otherMode.L; + specConstants[1].values[0] = desc.otherMode.H; + specConstants[2].values[0] = desc.colorCombiner.L; + specConstants[3].values[0] = desc.colorCombiner.H; + specConstants[4].values[0] = desc.flags.value; - // Spec constants should replace the constants embedded in the shader directly. 
- specConstants.emplace_back(0, desc.otherMode.L); - specConstants.emplace_back(1, desc.otherMode.H); - specConstants.emplace_back(2, desc.colorCombiner.L); - specConstants.emplace_back(3, desc.colorCombiner.H); - specConstants.emplace_back(4, desc.flags.value); + bool vsRun = respv::Optimizer::run(*VS, specConstants.data(), uint32_t(specConstants.size()), optimizedVS); + bool psRun = respv::Optimizer::run(*PS, specConstants.data(), uint32_t(specConstants.size()), optimizedPS); + assert(vsRun && psRun && "Shader optimization must always succeed as the inputs are always the same."); + + vertexShader = device->createShader(optimizedVS.data(), optimizedVS.size(), "VSMain", shaderFormat); + pixelShader = device->createShader(optimizedPS.data(), optimizedPS.size(), "PSMain", shaderFormat); } else { # if defined(_WIN32) @@ -180,13 +202,11 @@ namespace RT64 { creation.pixelShader = pixelShader.get(); creation.alphaBlend = !copyMode && interop::Blender::usesAlphaBlend(desc.otherMode); creation.culling = !copyMode && desc.flags.culling; - creation.zCmp = !copyMode && desc.otherMode.zCmp(); + creation.zCmp = !copyMode && desc.otherMode.zCmp() && (desc.otherMode.zMode() != ZMODE_DEC); creation.zUpd = !copyMode && desc.otherMode.zUpd(); - creation.zDecal = !copyMode && (desc.otherMode.zMode() == ZMODE_DEC); creation.cvgAdd = (desc.otherMode.cvgDst() == CVG_DST_WRAP) || (desc.otherMode.cvgDst() == CVG_DST_SAVE); creation.NoN = desc.flags.NoN; creation.usesHDR = desc.flags.usesHDR; - creation.specConstants = specConstants; creation.multisampling = multisampling; pipeline = createPipeline(creation); } @@ -227,7 +247,7 @@ namespace RT64 { pss << std::string_view(RenderParamsText, sizeof(RenderParamsText)); pss << "RenderParams getRenderParams() {" + renderParamsCode + "; return rp; }"; pss << - "bool RasterPS(const RenderParams, bool, float4, float2, float4, float4, uint, out float4, out float4, out float);" + "bool RasterPS(const RenderParams, bool, float4, float2, float4, 
float4, bool, uint, out float4, out float4, out float);" "[shader(\"pixel\")]" "void PSMain(" " in float4 vertexPosition : SV_POSITION" @@ -266,7 +286,7 @@ namespace RT64 { " float4 resultColor;" " float4 resultAlpha;" " float resultDepth;" - " if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, sampleIndex, resultColor, resultAlpha, resultDepth)) discard;" + " if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, false, sampleIndex, resultColor, resultAlpha, resultDepth)) discard;" " pixelColor = resultColor;" " pixelAlpha = resultAlpha;"; @@ -280,8 +300,6 @@ namespace RT64 { } std::unique_ptr RasterShader::createPipeline(const PipelineCreation &c) { - assert((!c.zDecal || !c.zUpd) && "Decals with depth write should never be created."); - RenderGraphicsPipelineDesc pipelineDesc; pipelineDesc.renderTargetBlend[0] = RenderBlendDesc::Copy(); pipelineDesc.renderTargetFormat[0] = RenderTarget::colorBufferFormat(c.usesHDR); @@ -289,6 +307,7 @@ namespace RT64 { pipelineDesc.cullMode = c.culling ? RenderCullMode::FRONT : RenderCullMode::NONE; pipelineDesc.depthClipEnabled = !c.NoN; pipelineDesc.depthEnabled = c.zCmp || c.zUpd; + pipelineDesc.depthFunction = c.zCmp ? RenderComparisonFunction::LESS : RenderComparisonFunction::ALWAYS; pipelineDesc.depthWriteEnabled = c.zUpd; pipelineDesc.depthTargetFormat = RenderFormat::D32_FLOAT; pipelineDesc.multisampling = c.multisampling; @@ -303,20 +322,6 @@ namespace RT64 { pipelineDesc.specConstants = c.specConstants.data(); pipelineDesc.specConstantsCount = uint32_t(c.specConstants.size()); - if (c.zCmp) { - // While these modes evaluate equality in the hardware, we use LEQUAL to simulate the depth comparison in the shader instead. 
- if (c.zDecal) { - pipelineDesc.depthFunction = RenderComparisonFunction::LESS_EQUAL; - } - // ZMODE_OPA, ZMODE_XLU and ZMODE_INTER only differ based on coverage, which is not emulated, so they can all be approximated the same way. - else { - pipelineDesc.depthFunction = RenderComparisonFunction::LESS; - } - } - else { - pipelineDesc.depthFunction = RenderComparisonFunction::ALWAYS; - } - // Alpha blending is performed by using dual source blending. The blending factor will be in the secondary output. RenderBlendDesc &targetBlend = pipelineDesc.renderTargetBlend[0]; if (c.alphaBlend) { @@ -438,32 +443,26 @@ namespace RT64 { pipelineLayout = layoutBuilder.create(device); // Generate all possible combinations of pipeline creations and assign them to each thread. Skip the ones that are invalid. + uint32_t pipelineCount = uint32_t(std::size(pipelines)); pipelineThreadCreations.clear(); - pipelineThreadCreations.resize(threadCount); + pipelineThreadCreations.resize(std::min(threadCount, pipelineCount)); PipelineCreation creation; creation.device = device; creation.pipelineLayout = pipelineLayout.get(); creation.vertexShader = vertexShader.get(); creation.pixelShader = pixelShader.get(); + creation.alphaBlend = true; + creation.culling = false; creation.NoN = true; creation.usesHDR = shaderLibrary->usesHDR; creation.multisampling = multisampling; uint32_t threadIndex = 0; - uint32_t pipelineCount = uint32_t(std::size(pipelines)); for (uint32_t i = 0; i < pipelineCount; i++) { - creation.alphaBlend = i & (1 << 0); - creation.culling = i & (1 << 1); - creation.zCmp = i & (1 << 2); - creation.zUpd = i & (1 << 3); - creation.zDecal = i & (1 << 4); - creation.cvgAdd = i & (1 << 5); - - // Skip all PSOs that would lead to invalid decal behavior. 
- if (creation.zDecal && (creation.zUpd || !creation.zCmp)) { - continue; - } + creation.zCmp = i & (1 << 0); + creation.zUpd = i & (1 << 1); + creation.cvgAdd = i & (1 << 2); pipelineThreadCreations[threadIndex].emplace_back(creation); threadIndex = (threadIndex + 1) % threadCount; @@ -471,8 +470,8 @@ namespace RT64 { // Spawn the threads that will compile all the pipelines. pipelineThreads.clear(); - pipelineThreads.resize(threadCount); - for (uint32_t i = 0; i < threadCount; i++) { + pipelineThreads.resize(pipelineThreadCreations.size()); + for (uint32_t i = 0; i < uint32_t(pipelineThreads.size()); i++) { pipelineThreads[i] = std::make_unique(&RasterShaderUber::threadCreatePipelines, this, i); } @@ -533,7 +532,7 @@ namespace RT64 { void RasterShaderUber::threadCreatePipelines(uint32_t threadIndex) { for (const PipelineCreation &creation : pipelineThreadCreations[threadIndex]) { - uint32_t pipelineIndex = pipelineStateIndex(creation.alphaBlend, creation.culling, creation.zCmp, creation.zUpd, creation.zDecal, creation.cvgAdd); + uint32_t pipelineIndex = pipelineStateIndex(creation.zCmp, creation.zUpd, creation.cvgAdd); pipelines[pipelineIndex] = RasterShader::createPipeline(creation); } } @@ -552,23 +551,14 @@ namespace RT64 { } } - uint32_t RasterShaderUber::pipelineStateIndex(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const { + uint32_t RasterShaderUber::pipelineStateIndex(bool zCmp, bool zUpd, bool cvgAdd) const { return - (uint32_t(alphaBlend) << 0) | - (uint32_t(culling) << 1) | - (uint32_t(zCmp) << 2) | - (uint32_t(zUpd) << 3) | - (uint32_t(zDecal) << 4) | - (uint32_t(cvgAdd) << 5); + (uint32_t(zCmp) << 0) | + (uint32_t(zUpd) << 1) | + (uint32_t(cvgAdd) << 2); } - const RenderPipeline *RasterShaderUber::getPipeline(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const { - // Force read and turn off writing on decal modes since those PSOs are not generated. 
- if (zDecal) { - zCmp = true; - zUpd = false; - } - - return pipelines[pipelineStateIndex(alphaBlend, culling, zCmp, zUpd, zDecal, cvgAdd)].get(); + const RenderPipeline *RasterShaderUber::getPipeline(bool zCmp, bool zUpd, bool cvgAdd) const { + return pipelines[pipelineStateIndex(zCmp, zUpd, cvgAdd)].get(); } }; \ No newline at end of file diff --git a/src/render/rt64_raster_shader.h b/src/render/rt64_raster_shader.h index ac3fa96..55c5b15 100644 --- a/src/render/rt64_raster_shader.h +++ b/src/render/rt64_raster_shader.h @@ -9,6 +9,8 @@ #include #include +#include "re-spirv/re-spirv.h" + #include "rhi/rt64_render_interface.h" #include "shared/rt64_blender.h" #include "shared/rt64_color_combiner.h" @@ -21,6 +23,19 @@ #define SAMPLE_LOCATIONS_REQUIRED 1 namespace RT64 { + struct OptimizerCacheSPIRV { + respv::Shader rasterVS; + respv::Shader rasterVSFlat; + respv::Shader rasterPS; + respv::Shader rasterPSDepth; + respv::Shader rasterPSDepthMS; + respv::Shader rasterPSFlatDepth; + respv::Shader rasterPSFlatDepthMS; + respv::Shader rasterPSFlat; + + void initialize(); + }; + struct PipelineCreation { RenderDevice *device; const RenderPipelineLayout *pipelineLayout; @@ -31,7 +46,6 @@ namespace RT64 { bool NoN; bool zCmp; bool zUpd; - bool zDecal; bool cvgAdd; bool usesHDR; std::vector specConstants; @@ -49,7 +63,7 @@ namespace RT64 { std::unique_ptr pipeline; RasterShader(RenderDevice *device, const ShaderDescription &desc, const RenderPipelineLayout *pipelineLayout, RenderShaderFormat shaderFormat, const RenderMultisampling &multisampling, - const ShaderCompiler *shaderCompiler, std::vector *vsBytes = nullptr, std::vector *psBytes = nullptr, bool useBytes = false); + const ShaderCompiler *shaderCompiler, const OptimizerCacheSPIRV *optimizerCacheSPIRV, std::vector *vsBytes = nullptr, std::vector *psBytes = nullptr, bool useBytes = false); ~RasterShader(); static RasterShaderText generateShaderText(const ShaderDescription &desc, bool multisampling); @@ -61,7 +75,7 @@ 
namespace RT64 { static const uint64_t RasterVSLibraryHash; static const uint64_t RasterPSLibraryHash; - std::unique_ptr pipelines[64]; + std::unique_ptr pipelines[8]; std::unique_ptr postBlendDitherNoiseAddPipeline; std::unique_ptr postBlendDitherNoiseSubPipeline; std::mutex pipelinesMutex; @@ -76,7 +90,7 @@ namespace RT64 { ~RasterShaderUber(); void threadCreatePipelines(uint32_t threadIndex); void waitForPipelineCreation(); - uint32_t pipelineStateIndex(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const; - const RenderPipeline *getPipeline(bool alphaBlend, bool culling, bool zCmp, bool zUpd, bool zDecal, bool cvgAdd) const; + uint32_t pipelineStateIndex(bool zCmp, bool zUpd, bool cvgAdd) const; + const RenderPipeline *getPipeline(bool zCmp, bool zUpd, bool cvgAdd) const; }; }; \ No newline at end of file diff --git a/src/render/rt64_raster_shader_cache.cpp b/src/render/rt64_raster_shader_cache.cpp index abfaeb9..353f8c2 100644 --- a/src/render/rt64_raster_shader_cache.cpp +++ b/src/render/rt64_raster_shader_cache.cpp @@ -209,7 +209,7 @@ namespace RT64 { assert((shaderCache->shaderUber != nullptr) && "Ubershader should've been created by the time a new shader is submitted to the cache."); const RenderPipelineLayout *uberPipelineLayout = shaderCache->shaderUber->pipelineLayout.get(); const RenderMultisampling multisampling = shaderCache->multisampling; - std::unique_ptr newShader = std::make_unique(shaderCache->device, shaderDesc, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), shaderVsBytes, shaderPsBytes, useShaderBytes); + std::unique_ptr newShader = std::make_unique(shaderCache->device, shaderDesc, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), &shaderCache->optimizerCacheSPIRV, shaderVsBytes, shaderPsBytes, useShaderBytes); // Dump the bytes of the shader if requested. 
if (!useShaderBytes && (shaderVsBytes != nullptr) && (shaderPsBytes != nullptr)) { @@ -220,7 +220,7 @@ namespace RT64 { // Toggle the use of HDR and compile another shader. ShaderDescription shaderDescAlt = shaderDesc; shaderDescAlt.flags.usesHDR = (shaderDescAlt.flags.usesHDR == 0); - std::unique_ptr altShader = std::make_unique(shaderCache->device, shaderDescAlt, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), shaderVsBytes, shaderPsBytes, useShaderBytes); + std::unique_ptr altShader = std::make_unique(shaderCache->device, shaderDescAlt, uberPipelineLayout, shaderCache->shaderFormat, multisampling, shaderCache->shaderCompiler.get(), &shaderCache->optimizerCacheSPIRV, shaderVsBytes, shaderPsBytes, useShaderBytes); shaderCache->offlineDumper.stepDumping(shaderDescAlt, dumperVsBytes, dumperPsBytes); } } @@ -235,10 +235,11 @@ namespace RT64 { // RasterShaderCache - RasterShaderCache::RasterShaderCache(uint32_t threadCount) { + RasterShaderCache::RasterShaderCache(uint32_t threadCount, uint32_t ubershaderThreadCount) { assert(threadCount > 0); this->threadCount = threadCount; + this->ubershaderThreadCount = ubershaderThreadCount; #ifdef ENABLE_OPTIMIZED_SHADER_GENERATION # ifdef _WIN32 @@ -264,8 +265,13 @@ namespace RT64 { this->shaderFormat = shaderFormat; this->multisampling = multisampling; - shaderUber = std::make_unique(device, shaderFormat, multisampling, shaderLibrary, threadCount); + shaderUber = std::make_unique(device, shaderFormat, multisampling, shaderLibrary, ubershaderThreadCount); usesHDR = shaderLibrary->usesHDR; + + // Initialize the re-spirv optimizer cache. 
+ if (shaderFormat == RenderShaderFormat::SPIRV) { + optimizerCacheSPIRV.initialize(); + } } void RasterShaderCache::submit(const ShaderDescription &desc) { diff --git a/src/render/rt64_raster_shader_cache.h b/src/render/rt64_raster_shader_cache.h index 46c089c..7357fc8 100644 --- a/src/render/rt64_raster_shader_cache.h +++ b/src/render/rt64_raster_shader_cache.h @@ -55,6 +55,7 @@ namespace RT64 { RenderDevice *device; std::unique_ptr shaderUber; + OptimizerCacheSPIRV optimizerCacheSPIRV; std::mutex submissionMutex; std::queue descQueue; std::mutex descQueueMutex; @@ -65,6 +66,7 @@ namespace RT64 { std::mutex GPUShadersMutex; std::list> compilationThreads; uint32_t threadCount; + uint32_t ubershaderThreadCount; RenderShaderFormat shaderFormat; std::unique_ptr shaderCompiler; RenderMultisampling multisampling; @@ -73,7 +75,7 @@ namespace RT64 { std::mutex offlineDumperMutex; bool usesHDR = false; - RasterShaderCache(uint32_t threadCount); + RasterShaderCache(uint32_t threadCount, uint32_t ubershaderThreadCount); ~RasterShaderCache(); void setup(RenderDevice *device, RenderShaderFormat shaderFormat, const ShaderLibrary *shaderLibrary, const RenderMultisampling &multisampling); void submit(const ShaderDescription &desc); diff --git a/src/shaders/RasterPS.hlsl b/src/shaders/RasterPS.hlsl index 289b18c..cae0d0e 100644 --- a/src/shaders/RasterPS.hlsl +++ b/src/shaders/RasterPS.hlsl @@ -29,12 +29,19 @@ float sampleBackgroundDepth(int2 pixelPos, uint sampleIndex) { #endif LIBRARY_EXPORT bool RasterPS(const RenderParams rp, bool outputDepth, float4 vertexPosition, float2 vertexUV, float4 vertexSmoothColor, float4 vertexFlatColor, - uint sampleIndex, out float4 resultColor, out float4 resultAlpha, out float resultDepth) + bool isFrontFace, uint sampleIndex, out float4 resultColor, out float4 resultAlpha, out float resultDepth) { + const OtherMode otherMode = { rp.omL, rp.omH }; +#if defined(DYNAMIC_RENDER_PARAMS) + if ((otherMode.cycleType() != G_CYC_COPY) && 
renderFlagCulling(rp.flags) && isFrontFace) { + resultDepth = vertexPosition.z; + return false; + } +#endif + const uint instanceIndex = instanceRenderIndices[gConstants.renderIndex].instanceIndex; const float4 vertexColor = renderFlagSmoothShade(rp.flags) ? vertexSmoothColor : float4(vertexFlatColor.rgb, vertexSmoothColor.a); const ColorCombiner colorCombiner = { rp.ccL, rp.ccH }; - const OtherMode otherMode = { rp.omL, rp.omH }; const bool depthClampNear = renderFlagNoN(rp.flags); const bool depthDecal = (otherMode.zMode() == ZMODE_DEC); const bool zSourcePrim = (otherMode.zSource() == G_ZS_PRIM); @@ -269,6 +276,9 @@ void PSMain( #if defined(DYNAMIC_RENDER_PARAMS) || defined(VERTEX_FLAT_COLOR) , nointerpolation in float4 vertexFlatColor : COLOR1 #endif +#if defined(DYNAMIC_RENDER_PARAMS) + , bool isFrontFace : SV_IsFrontFace +#endif #if defined(MULTISAMPLING) , in uint sampleIndex : SV_SampleIndex #endif @@ -283,6 +293,7 @@ void PSMain( #if !defined(VERTEX_FLAT_COLOR) float4 vertexFlatColor = 0.0f; #endif + bool isFrontFace = false; #endif #if !defined(MULTISAMPLING) uint sampleIndex = 0; @@ -295,7 +306,7 @@ void PSMain( float4 resultColor; float4 resultAlpha; float resultDepth; - if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, sampleIndex, resultColor, resultAlpha, resultDepth)) { + if (!RasterPS(getRenderParams(), outputDepth, vertexPosition, vertexUV, vertexSmoothColor, vertexFlatColor, isFrontFace, sampleIndex, resultColor, resultAlpha, resultDepth)) { discard; }