From 7d78cf0f6fbca526e748590aeb6dfdc3c016669e Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 20 Jul 2017 15:25:24 +1000 Subject: [PATCH] ShaderGen: Implement pixel ubershaders --- Source/Core/VideoBackends/D3D/main.cpp | 2 + Source/Core/VideoBackends/OGL/Render.cpp | 10 + .../VideoBackends/Vulkan/VulkanContext.cpp | 2 + Source/Core/VideoCommon/BPStructs.cpp | 36 +- Source/Core/VideoCommon/CMakeLists.txt | 2 + Source/Core/VideoCommon/ConstantManager.h | 17 +- Source/Core/VideoCommon/PixelEngine.cpp | 2 + Source/Core/VideoCommon/PixelShaderGen.cpp | 24 +- .../Core/VideoCommon/PixelShaderManager.cpp | 215 +++- Source/Core/VideoCommon/PixelShaderManager.h | 14 +- Source/Core/VideoCommon/ShaderGenCommon.cpp | 5 +- Source/Core/VideoCommon/ShaderGenCommon.h | 4 +- Source/Core/VideoCommon/UberShaderCommon.cpp | 27 + Source/Core/VideoCommon/UberShaderCommon.h | 24 + Source/Core/VideoCommon/UberShaderPixel.cpp | 1102 +++++++++++++++++ Source/Core/VideoCommon/UberShaderPixel.h | 31 + Source/Core/VideoCommon/VideoCommon.vcxproj | 6 +- .../VideoCommon/VideoCommon.vcxproj.filters | 12 + Source/Core/VideoCommon/VideoConfig.h | 2 + 19 files changed, 1520 insertions(+), 17 deletions(-) create mode 100644 Source/Core/VideoCommon/UberShaderCommon.cpp create mode 100644 Source/Core/VideoCommon/UberShaderCommon.h create mode 100644 Source/Core/VideoCommon/UberShaderPixel.cpp create mode 100644 Source/Core/VideoCommon/UberShaderPixel.h diff --git a/Source/Core/VideoBackends/D3D/main.cpp b/Source/Core/VideoBackends/D3D/main.cpp index 5cc05f2af9..9063c735d5 100644 --- a/Source/Core/VideoBackends/D3D/main.cpp +++ b/Source/Core/VideoBackends/D3D/main.cpp @@ -78,6 +78,8 @@ void VideoBackend::InitBackendInfo() g_Config.backend_info.bSupportsInternalResolutionFrameDumps = false; g_Config.backend_info.bSupportsGPUTextureDecoding = false; g_Config.backend_info.bSupportsST3CTextures = false; + g_Config.backend_info.bSupportsBitfield = false; + 
g_Config.backend_info.bSupportsDynamicSamplerIndexing = false; IDXGIFactory* factory; IDXGIAdapter* ad; diff --git a/Source/Core/VideoBackends/OGL/Render.cpp b/Source/Core/VideoBackends/OGL/Render.cpp index 4392da994c..66edbf5d74 100644 --- a/Source/Core/VideoBackends/OGL/Render.cpp +++ b/Source/Core/VideoBackends/OGL/Render.cpp @@ -447,6 +447,12 @@ Renderer::Renderer() // Clip distance support is useless without a method to clamp the depth range g_Config.backend_info.bSupportsDepthClamp = GLExtensions::Supports("GL_ARB_depth_clamp"); + // Desktop OpenGL supports bitfield manipulation and dynamic sampler indexing if it supports + // shader5. OpenGL ES 3.1 supports it implicitly without an extension + g_Config.backend_info.bSupportsBitfield = GLExtensions::Supports("GL_ARB_gpu_shader5"); + g_Config.backend_info.bSupportsDynamicSamplerIndexing = + GLExtensions::Supports("GL_ARB_gpu_shader5"); + g_ogl_config.bSupportsGLSLCache = GLExtensions::Supports("GL_ARB_get_program_binary"); g_ogl_config.bSupportsGLPinnedMemory = GLExtensions::Supports("GL_AMD_pinned_memory"); g_ogl_config.bSupportsGLSync = GLExtensions::Supports("GL_ARB_sync"); @@ -515,6 +521,8 @@ Renderer::Renderer() g_ogl_config.bSupportsMSAA = true; g_ogl_config.bSupportsTextureStorage = true; g_ogl_config.bSupports2DTextureStorageMultisample = true; + g_Config.backend_info.bSupportsBitfield = true; + g_Config.backend_info.bSupportsDynamicSamplerIndexing = g_ogl_config.bSupportsAEP; if (g_ActiveConfig.iStereoMode > 0 && g_ActiveConfig.iMultisamples > 1 && !g_ogl_config.bSupports3DTextureStorageMultisample) { @@ -542,6 +550,8 @@ Renderer::Renderer() g_ogl_config.bSupportsTextureStorage = true; g_ogl_config.bSupports2DTextureStorageMultisample = true; g_ogl_config.bSupports3DTextureStorageMultisample = true; + g_Config.backend_info.bSupportsBitfield = true; + g_Config.backend_info.bSupportsDynamicSamplerIndexing = true; } } else diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp 
b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp index 8ab3245455..af9e249833 100644 --- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp +++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp @@ -236,6 +236,8 @@ void VulkanContext::PopulateBackendInfo(VideoConfig* config) config->backend_info.bSupportsMultithreading = true; // Assumed support. config->backend_info.bSupportsComputeShaders = true; // Assumed support. config->backend_info.bSupportsGPUTextureDecoding = true; // Assumed support. + config->backend_info.bSupportsBitfield = true; // Assumed support. + config->backend_info.bSupportsDynamicSamplerIndexing = true; // Assumed support. config->backend_info.bSupportsInternalResolutionFrameDumps = true; // Assumed support. config->backend_info.bSupportsPostProcessing = true; // Assumed support. config->backend_info.bSupportsDualSourceBlend = false; // Dependent on features. diff --git a/Source/Core/VideoCommon/BPStructs.cpp b/Source/Core/VideoCommon/BPStructs.cpp index 509718003c..c686d4ff01 100644 --- a/Source/Core/VideoCommon/BPStructs.cpp +++ b/Source/Core/VideoCommon/BPStructs.cpp @@ -93,6 +93,9 @@ static void BPWritten(const BPCmd& bp) (u32)bpmem.genMode.cullmode, (u32)bpmem.genMode.numindstages, (u32)bpmem.genMode.zfreeze); + if (bp.changes) + PixelShaderManager::SetGenModeChanged(); + // Only call SetGenerationMode when cull mode changes. 
if (bp.changes & 0xC000) SetGenerationMode(); @@ -155,13 +158,20 @@ static void BPWritten(const BPCmd& bp) // Set Color Mask if (bp.changes & 0x18) // colorupdate | alphaupdate SetColorMask(); + + // Dither + if (bp.changes & 0x04) + PixelShaderManager::SetBlendModeChanged(); } return; case BPMEM_CONSTANTALPHA: // Set Destination Alpha PRIM_LOG("constalpha: alp=%d, en=%d", bpmem.dstalpha.alpha.Value(), bpmem.dstalpha.enable.Value()); - if (bp.changes & 0xFF) - PixelShaderManager::SetDestAlpha(); + if (bp.changes) + { + PixelShaderManager::SetAlpha(); + PixelShaderManager::SetDestAlphaChanged(); + } if (bp.changes & 0x100) SetBlendMode(); return; @@ -238,6 +248,7 @@ static void BPWritten(const BPCmd& bp) // the number of lines copied is determined by the y scale * source efb height BoundingBox::active = false; + PixelShaderManager::SetBoundingBoxActive(false); float yScale; if (PE_copy.scale_invert) @@ -318,6 +329,7 @@ static void BPWritten(const BPCmd& bp) PixelShaderManager::SetAlpha(); if (bp.changes) { + PixelShaderManager::SetAlphaTestChanged(); g_renderer->SetColorMask(); SetBlendMode(); } @@ -332,7 +344,7 @@ static void BPWritten(const BPCmd& bp) if (bp.changes & 3) PixelShaderManager::SetZTextureTypeChanged(); if (bp.changes & 12) - VertexShaderManager::SetViewportChanged(); + PixelShaderManager::SetZTextureOpChanged(); #if defined(_DEBUG) || defined(DEBUGFAST) const char* pzop[] = {"DISABLE", "ADD", "REPLACE", "?"}; const char* pztype[] = {"Z8", "Z16", "Z24", "?"}; @@ -390,6 +402,7 @@ static void BPWritten(const BPCmd& bp) { u8 offset = bp.address & 2; BoundingBox::active = true; + PixelShaderManager::SetBoundingBoxActive(true); if (g_ActiveConfig.backend_info.bSupportsBBox && g_ActiveConfig.bBBoxEnable) { @@ -426,6 +439,11 @@ static void BPWritten(const BPCmd& bp) * 3 BC0 - Ind. Tex Stage 0 NTexCoord * 0 BI0 - Ind. 
Tex Stage 0 NTexMap */ case BPMEM_IREF: + { + if (bp.changes) + PixelShaderManager::SetTevIndirectChanged(); + return; + } case BPMEM_TEV_KSEL: // Texture Environment Swap Mode Table 0 case BPMEM_TEV_KSEL + 1: // Texture Environment Swap Mode Table 1 @@ -435,6 +453,8 @@ static void BPWritten(const BPCmd& bp) case BPMEM_TEV_KSEL + 5: // Texture Environment Swap Mode Table 5 case BPMEM_TEV_KSEL + 6: // Texture Environment Swap Mode Table 6 case BPMEM_TEV_KSEL + 7: // Texture Environment Swap Mode Table 7 + PixelShaderManager::SetTevKSel(bp.address - BPMEM_TEV_KSEL, bp.newvalue); + return; /* This Register can be used to limit to which bits of BP registers is * actually written to. The mask is only valid for the next BP write, @@ -567,6 +587,7 @@ static void BPWritten(const BPCmd& bp) // ------------------------- case BPMEM_TREF: case BPMEM_TREF + 4: + PixelShaderManager::SetTevOrder(bp.address - BPMEM_TREF, bp.newvalue); return; // ---------------------- // Set wrap size @@ -630,15 +651,18 @@ static void BPWritten(const BPCmd& bp) // -------------- // Indirect Tev // -------------- - case BPMEM_IND_CMD: // Indirect 0-15 + case BPMEM_IND_CMD: + PixelShaderManager::SetTevIndirectChanged(); return; // -------------------------------------------------- // Set Color/Alpha of a Tev // BPMEM_TEV_COLOR_ENV - Dest, Shift, Clamp, Sub, Bias, Sel A, Sel B, Sel C, Sel D // BPMEM_TEV_ALPHA_ENV - Dest, Shift, Clamp, Sub, Bias, Sel A, Sel B, Sel C, Sel D, T Swap, R Swap // -------------------------------------------------- - case BPMEM_TEV_COLOR_ENV: // Texture Environment Color/Alpha 0-7 - case BPMEM_TEV_COLOR_ENV + 16: // Texture Environment Color/Alpha 8-15 + case BPMEM_TEV_COLOR_ENV: // Texture Environment 1 + case BPMEM_TEV_COLOR_ENV + 16: + PixelShaderManager::SetTevCombiner((bp.address - BPMEM_TEV_COLOR_ENV) >> 1, + (bp.address - BPMEM_TEV_COLOR_ENV) & 1, bp.newvalue); return; default: break; diff --git a/Source/Core/VideoCommon/CMakeLists.txt 
b/Source/Core/VideoCommon/CMakeLists.txt index 782fc7af7e..406e726d71 100644 --- a/Source/Core/VideoCommon/CMakeLists.txt +++ b/Source/Core/VideoCommon/CMakeLists.txt @@ -32,6 +32,8 @@ set(SRCS RenderState.cpp ShaderGenCommon.cpp Statistics.cpp + UberShaderCommon.cpp + UberShaderPixel.cpp TextureCacheBase.cpp TextureConfig.cpp TextureConversionShader.cpp diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index c3b7004e69..98a23c8f76 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -24,7 +24,22 @@ struct PixelShaderConstants int4 fogi; float4 fogf[2]; float4 zslope; - float4 efbscale; + float efbscale[2]; + + // Constants from here onwards are only used in ubershaders. + u32 genmode; // .z + u32 alphaTest; // .w + u32 fogParam3; // .x + u32 fogRangeBase; // .y + u32 dstalpha; // .z + u32 ztex_op; // .w + u32 early_ztest; // .x (bool) + u32 rgba6_format; // .y (bool) + u32 dither; // .z (bool) + u32 bounding_box; // .w (bool) + uint4 pack1[16]; // .xy - combiners, .z - tevind, .w - iref + uint4 pack2[8]; // .x - tevorder, .y - tevksel + int4 konst[32]; // .rgba }; struct VertexShaderConstants diff --git a/Source/Core/VideoCommon/PixelEngine.cpp b/Source/Core/VideoCommon/PixelEngine.cpp index 9c4282170b..02cda26917 100644 --- a/Source/Core/VideoCommon/PixelEngine.cpp +++ b/Source/Core/VideoCommon/PixelEngine.cpp @@ -18,6 +18,7 @@ #include "VideoCommon/CommandProcessor.h" #include "VideoCommon/Fifo.h" #include "VideoCommon/PixelEngine.h" +#include "VideoCommon/PixelShaderManager.h" namespace PixelEngine { @@ -231,6 +232,7 @@ void RegisterMMIO(MMIO::Mapping* mmio, u32 base) { mmio->Register(base | (PE_BBOX_LEFT + 2 * i), MMIO::ComplexRead([i](u32) { BoundingBox::active = false; + PixelShaderManager::SetBoundingBoxActive(false); return g_video_backend->Video_GetBoundingBox(i); }), MMIO::InvalidWrite()); diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp 
b/Source/Core/VideoCommon/PixelShaderGen.cpp index fe0932ba70..036709aa1a 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -394,8 +394,26 @@ void WritePixelShaderCommonHeader(ShaderCode& out, APIType ApiType, bool boundin "\tint4 " I_FOGI ";\n" "\tfloat4 " I_FOGF "[2];\n" "\tfloat4 " I_ZSLOPE ";\n" - "\tfloat4 " I_EFBSCALE ";\n" - "};\n"); + "\tfloat2 " I_EFBSCALE ";\n" + "\tuint bpmem_genmode;\n" + "\tuint bpmem_alphaTest;\n" + "\tuint bpmem_fogParam3;\n" + "\tuint bpmem_fogRangeBase;\n" + "\tuint bpmem_dstalpha;\n" + "\tuint bpmem_ztex_op;\n" + "\tbool bpmem_early_ztest;\n" + "\tbool bpmem_rgba6_format;\n" + "\tbool bpmem_dither;\n" + "\tbool bpmem_bounding_box;\n" + "\tuint4 bpmem_pack1[16];\n" // .xy - combiners, .z - tevind + "\tuint4 bpmem_pack2[8];\n" // .x - tevorder, .y - tevksel + "\tint4 konstLookup[32];\n" + "};\n\n"); + out.Write("#define bpmem_combiners(i) (bpmem_pack1[(i)].xy)\n" + "#define bpmem_tevind(i) (bpmem_pack1[(i)].z)\n" + "#define bpmem_iref(i) (bpmem_pack1[(i)].w)\n" + "#define bpmem_tevorder(i) (bpmem_pack2[(i)].x)\n" + "#define bpmem_tevksel(i) (bpmem_pack2[(i)].y)\n\n"); if (bounding_box) { @@ -449,7 +467,7 @@ ShaderCode GeneratePixelShaderCode(APIType ApiType, const ShaderHostConfig& host if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) out.Write("UBO_BINDING(std140, 2) uniform VSBlock {\n"); else - out.Write("cbuffer VSBlock : register(b2) {\n"); + out.Write("cbuffer VSBlock : register(b1) {\n"); out.Write(s_shader_uniforms); out.Write("};\n"); diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index c5a23a22c7..98fba2f08f 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -15,6 +15,8 @@ bool PixelShaderManager::s_bFogRangeAdjustChanged; bool PixelShaderManager::s_bViewPortChanged; +bool PixelShaderManager::s_bIndirectDirty; +bool 
PixelShaderManager::s_bDestAlphaDirty; PixelShaderConstants PixelShaderManager::constants; bool PixelShaderManager::dirty; @@ -40,6 +42,38 @@ void PixelShaderManager::Init() SetTexCoordChanged(6); SetTexCoordChanged(7); + // fixed Konstants + for (int component = 0; component < 4; component++) + { + constants.konst[0][component] = 255; // 1 + constants.konst[1][component] = 223; // 7/8 + constants.konst[2][component] = 191; // 3/4 + constants.konst[3][component] = 159; // 5/8 + constants.konst[4][component] = 128; // 1/2 + constants.konst[5][component] = 96; // 3/8 + constants.konst[6][component] = 64; // 1/4 + constants.konst[7][component] = 32; // 1/8 + + // Invalid Konstants (reads as zero on hardware) + constants.konst[8][component] = 0; + constants.konst[9][component] = 0; + constants.konst[10][component] = 0; + constants.konst[11][component] = 0; + + // Annoyingly, alpha reads zero values for the .rgb colors (officially + // defined as invalid) + // If it wasn't for this, we could just use one of the first 3 columns + // instead of + // wasting an entire 4th column just for alpha. + if (component == 3) + { + constants.konst[12][component] = 0; + constants.konst[13][component] = 0; + constants.konst[14][component] = 0; + constants.konst[15][component] = 0; + } + } + dirty = true; } @@ -99,6 +133,59 @@ void PixelShaderManager::SetConstants() dirty = true; s_bViewPortChanged = false; } + + if (s_bIndirectDirty) + { + for (int i = 0; i < 4; i++) + constants.pack1[i][3] = 0; + + for (u32 i = 0; i < (bpmem.genMode.numtevstages + 1); ++i) + { + u32 stage = bpmem.tevind[i].bt; + if (stage < bpmem.genMode.numindstages) + { + // We set some extra bits so the ubershader can quickly check if these + // features are in use. + if (bpmem.tevind[i].IsActive()) + constants.pack1[stage][3] = + bpmem.tevindref.getTexCoord(stage) | bpmem.tevindref.getTexMap(stage) << 8 | 1 << 16; + // Note: a tevind of zero just happens to be a passthrough, so no need + // to set an extra bit. 
+ constants.pack1[i][2] = + bpmem.tevind[i].hex; // TODO: This match shadergen, but videosw will + // always wrap. + + // The ubershader uses tevind != 0 as a condition whether to calculate texcoords, + // even when texture is disabled, instead of the stage < bpmem.genMode.numindstages. + // We set an unused bit here to indicate that the stage is active, even if it + // is just a pass-through. + constants.pack1[i][2] |= 0x80000000; + } + else + { + constants.pack1[i][2] = 0; + } + } + + dirty = true; + s_bIndirectDirty = false; + } + + if (s_bDestAlphaDirty) + { + // Destination alpha is only enabled if alpha writes are enabled. Force entire uniform to zero + // when disabled. + u32 dstalpha = bpmem.blendmode.alphaupdate && bpmem.dstalpha.enable && + bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 ? + bpmem.dstalpha.hex : + 0; + + if (constants.dstalpha != dstalpha) + { + constants.dstalpha = dstalpha; + dirty = true; + } + } } void PixelShaderManager::SetTevColor(int index, int component, s32 value) @@ -116,20 +203,78 @@ void PixelShaderManager::SetTevKonstColor(int index, int component, s32 value) c[component] = value; dirty = true; + // Konst for ubershaders. We build the whole array on cpu so the gpu can do a single indirect + // access. 
+ if (component != 3) // Alpha isn't included in the .rgb konsts + constants.konst[index + 12][component] = value; + + // .rrrr .gggg .bbbb .aaaa konsts + constants.konst[index + 16 + component * 4][0] = value; + constants.konst[index + 16 + component * 4][1] = value; + constants.konst[index + 16 + component * 4][2] = value; + constants.konst[index + 16 + component * 4][3] = value; + PRIM_LOG("tev konst color%d: %d %d %d %d", index, c[0], c[1], c[2], c[3]); } +void PixelShaderManager::SetTevOrder(int index, u32 order) +{ + if (constants.pack2[index][0] != order) + { + constants.pack2[index][0] = order; + dirty = true; + } +} + +void PixelShaderManager::SetTevKSel(int index, u32 ksel) +{ + if (constants.pack2[index][1] != ksel) + { + constants.pack2[index][1] = ksel; + dirty = true; + } +} + +void PixelShaderManager::SetTevCombiner(int index, int alpha, u32 combiner) +{ + if (constants.pack1[index][alpha] != combiner) + { + constants.pack1[index][alpha] = combiner; + dirty = true; + } +} + +void PixelShaderManager::SetTevIndirectChanged() +{ + s_bIndirectDirty = true; +} + void PixelShaderManager::SetAlpha() { constants.alpha[0] = bpmem.alpha_test.ref0; constants.alpha[1] = bpmem.alpha_test.ref1; + constants.alpha[3] = static_cast(bpmem.dstalpha.alpha); dirty = true; } -void PixelShaderManager::SetDestAlpha() +void PixelShaderManager::SetAlphaTestChanged() { - constants.alpha[3] = bpmem.dstalpha.alpha; - dirty = true; + // Force alphaTest Uniform to zero if it will always pass. + // (set an extra bit to distinguish from "never && never") + // TODO: we could optimize this further and check the actual constants, + // i.e. "a <= 0" and "a >= 255" will always pass. + u32 alpha_test = + bpmem.alpha_test.TestResult() != AlphaTest::PASS ? 
bpmem.alpha_test.hex | 1 << 31 : 0; + if (constants.alphaTest != alpha_test) + { + constants.alphaTest = alpha_test; + dirty = true; + } +} + +void PixelShaderManager::SetDestAlphaChanged() +{ + s_bDestAlphaDirty = true; } void PixelShaderManager::SetTexDims(int texmapid, u32 width, u32 height) @@ -235,6 +380,12 @@ void PixelShaderManager::SetZTextureTypeChanged() dirty = true; } +void PixelShaderManager::SetZTextureOpChanged() +{ + constants.ztex_op = bpmem.ztex2.op; + dirty = true; +} + void PixelShaderManager::SetTexCoordChanged(u8 texmapid) { TCoordInfo& tc = bpmem.texcoords[texmapid]; @@ -262,6 +413,7 @@ void PixelShaderManager::SetFogParamChanged() constants.fogi[1] = bpmem.fog.b_magnitude; constants.fogf[1][2] = bpmem.fog.c_proj_fsel.GetC(); constants.fogi[3] = bpmem.fog.b_shift; + constants.fogParam3 = bpmem.fog.c_proj_fsel.hex; } else { @@ -269,6 +421,7 @@ void PixelShaderManager::SetFogParamChanged() constants.fogi[1] = 1; constants.fogf[1][2] = 0.f; constants.fogi[3] = 1; + constants.fogParam3 = 0; } dirty = true; } @@ -279,12 +432,68 @@ void PixelShaderManager::SetFogRangeAdjustChanged() return; s_bFogRangeAdjustChanged = true; + + if (constants.fogRangeBase != bpmem.fogRange.Base.hex) + { + constants.fogRangeBase = bpmem.fogRange.Base.hex; + dirty = true; + } +} + +void PixelShaderManager::SetGenModeChanged() +{ + constants.genmode = bpmem.genMode.hex; + s_bIndirectDirty = true; + dirty = true; +} + +void PixelShaderManager::SetZControlChanged() +{ + u32 early_ztest = bpmem.zcontrol.early_ztest ? 1 : 0; + u32 rgba6_format = + (bpmem.zcontrol.pixel_format == PEControl::RGBA6_Z24 && !g_ActiveConfig.bForceTrueColor) ? 
1 : + 0; + u32 dither = rgba6_format && bpmem.blendmode.dither; + if (constants.early_ztest != early_ztest || constants.rgba6_format != rgba6_format || + constants.dither != dither) + { + constants.early_ztest = early_ztest; + constants.rgba6_format = rgba6_format; + constants.dither = dither; + dirty = true; + } + s_bDestAlphaDirty = true; +} + +void PixelShaderManager::SetBlendModeChanged() +{ + u32 dither = constants.rgba6_format && bpmem.blendmode.dither; + if (constants.dither != dither) + { + constants.dither = dither; + dirty = true; + } + s_bDestAlphaDirty = true; +} + +void PixelShaderManager::SetBoundingBoxActive(bool active) +{ + const bool enable = + active && g_ActiveConfig.bBBoxEnable && g_ActiveConfig.BBoxUseFragmentShaderImplementation(); + + if (enable == (constants.bounding_box != 0)) + return; + + constants.bounding_box = enable; + dirty = true; } void PixelShaderManager::DoState(PointerWrap& p) { p.Do(s_bFogRangeAdjustChanged); p.Do(s_bViewPortChanged); + p.Do(s_bIndirectDirty); + p.Do(s_bDestAlphaDirty); p.Do(constants); diff --git a/Source/Core/VideoCommon/PixelShaderManager.h b/Source/Core/VideoCommon/PixelShaderManager.h index c7d6e3b9ee..dcb244bccb 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.h +++ b/Source/Core/VideoCommon/PixelShaderManager.h @@ -24,24 +24,36 @@ public: // so make sure to call them after memory is committed static void SetTevColor(int index, int component, s32 value); static void SetTevKonstColor(int index, int component, s32 value); + static void SetTevOrder(int index, u32 order); + static void SetTevKSel(int index, u32 ksel); + static void SetTevCombiner(int index, int alpha, u32 combiner); static void SetAlpha(); - static void SetDestAlpha(); + static void SetAlphaTestChanged(); + static void SetDestAlphaChanged(); static void SetTexDims(int texmapid, u32 width, u32 height); static void SetZTextureBias(); static void SetViewportChanged(); static void SetEfbScaleChanged(float scalex, float scaley); static 
void SetZSlope(float dfdx, float dfdy, float f0); static void SetIndMatrixChanged(int matrixidx); + static void SetTevIndirectChanged(); static void SetZTextureTypeChanged(); + static void SetZTextureOpChanged(); static void SetIndTexScaleChanged(bool high); static void SetTexCoordChanged(u8 texmapid); static void SetFogColorChanged(); static void SetFogParamChanged(); static void SetFogRangeAdjustChanged(); + static void SetGenModeChanged(); + static void SetZControlChanged(); + static void SetBlendModeChanged(); + static void SetBoundingBoxActive(bool active); static PixelShaderConstants constants; static bool dirty; static bool s_bFogRangeAdjustChanged; static bool s_bViewPortChanged; + static bool s_bIndirectDirty; + static bool s_bDestAlphaDirty; }; diff --git a/Source/Core/VideoCommon/ShaderGenCommon.cpp b/Source/Core/VideoCommon/ShaderGenCommon.cpp index 58085fdafa..94819f3e34 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.cpp +++ b/Source/Core/VideoCommon/ShaderGenCommon.cpp @@ -29,6 +29,9 @@ ShaderHostConfig ShaderHostConfig::GetCurrent() bits.backend_atomics = g_ActiveConfig.backend_info.bSupportsFragmentStoresAndAtomics; bits.backend_depth_clamp = g_ActiveConfig.backend_info.bSupportsDepthClamp; bits.backend_reversed_depth_range = g_ActiveConfig.backend_info.bSupportsReversedDepthRange; + bits.backend_bitfield = g_ActiveConfig.backend_info.bSupportsBitfield; + bits.backend_dynamic_sampler_indexing = + g_ActiveConfig.backend_info.bSupportsDynamicSamplerIndexing; return bits; } @@ -65,7 +68,7 @@ std::string GetDiskShaderCacheFileName(APIType api_type, const char* type, bool if (include_host_config) { - // We're using 18 bits, so 5 hex characters. + // We're using 20 bits, so 5 hex characters. 
ShaderHostConfig host_config = ShaderHostConfig::GetCurrent(); filename += StringFromFormat("-%05X", host_config.bits); } diff --git a/Source/Core/VideoCommon/ShaderGenCommon.h b/Source/Core/VideoCommon/ShaderGenCommon.h index 5750e58c3e..fe4b48d36a 100644 --- a/Source/Core/VideoCommon/ShaderGenCommon.h +++ b/Source/Core/VideoCommon/ShaderGenCommon.h @@ -176,7 +176,9 @@ union ShaderHostConfig u32 backend_atomics : 1; u32 backend_depth_clamp : 1; u32 backend_reversed_depth_range : 1; - u32 pad : 14; + u32 backend_bitfield : 1; + u32 backend_dynamic_sampler_indexing : 1; + u32 pad : 12; }; static ShaderHostConfig GetCurrent(); diff --git a/Source/Core/VideoCommon/UberShaderCommon.cpp b/Source/Core/VideoCommon/UberShaderCommon.cpp new file mode 100644 index 0000000000..2c6ba23c28 --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderCommon.cpp @@ -0,0 +1,27 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include "VideoCommon/UberShaderCommon.h" +#include "VideoCommon/VideoConfig.h" + +namespace UberShader +{ +void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type, + const ShaderHostConfig& host_config) +{ + // ============================================== + // BitfieldExtract for APIs which don't have it + // ============================================== + if (!host_config.backend_bitfield) + { + out.Write("uint bitfieldExtract(uint val, int off, int size) {\n" + " // This built-in function is only support in OpenGL 4.0+ and ES 3.1+\n" + " // Microsoft's HLSL compiler automatically optimises this to a bitfield extract " + "instruction.\n" + " uint mask = uint((1 << size) - 1);\n" + " return uint(val >> off) & mask;\n" + "}\n\n"); + } +} +} \ No newline at end of file diff --git a/Source/Core/VideoCommon/UberShaderCommon.h b/Source/Core/VideoCommon/UberShaderCommon.h new file mode 100644 index 0000000000..cca2cc38bf --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderCommon.h @@ 
-0,0 +1,24 @@ +// Copyright 2017 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include "VideoCommon/ShaderGenCommon.h" +#include "VideoCommon/VideoCommon.h" + +namespace UberShader +{ +// Common functions across all ubershaders +void WriteUberShaderCommonHeader(ShaderCode& out, APIType api_type, + const ShaderHostConfig& host_config); + +// bitfieldExtract generator for BitField types +template +std::string BitfieldExtract(const std::string& source, T type) +{ + return StringFromFormat("bitfieldExtract(%s, %u, %u)", source.c_str(), + static_cast(type.StartBit()), static_cast(type.NumBits())); +} + +} // namespace UberShader diff --git a/Source/Core/VideoCommon/UberShaderPixel.cpp b/Source/Core/VideoCommon/UberShaderPixel.cpp new file mode 100644 index 0000000000..82e3382e2d --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderPixel.cpp @@ -0,0 +1,1102 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. 
+ +#include "VideoCommon/UberShaderPixel.h" +#include "VideoCommon/BPMemory.h" +#include "VideoCommon/DriverDetails.h" +#include "VideoCommon/UberShaderCommon.h" +#include "VideoCommon/XFMemory.h" + +namespace UberShader +{ +PixelShaderUid GetPixelShaderUid() +{ + PixelShaderUid out; + pixel_ubershader_uid_data* uid_data = out.GetUidData(); + memset(uid_data, 0, sizeof(*uid_data)); + uid_data->num_texgens = xfmem.numTexGen.numTexGens; + uid_data->early_depth = + bpmem.UseEarlyDepthTest() && + (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) && + !(bpmem.zmode.testenable && bpmem.genMode.zfreeze); + uid_data->per_pixel_depth = + (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || + (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !uid_data->early_depth) || + (bpmem.zmode.testenable && bpmem.genMode.zfreeze); + return out; +} + +ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config, + const pixel_ubershader_uid_data* uid_data) +{ + // TODO: Support per-pixel lighting. + // This can be based on the vertex ubershaders, at the cost of a more expensive pixel shader. + const bool per_pixel_lighting = host_config.per_pixel_lighting; + const bool msaa = host_config.msaa; + const bool ssaa = host_config.ssaa; + const bool stereo = host_config.stereo; + const bool use_dual_source = host_config.backend_dual_source_blend; + const bool early_depth = uid_data->early_depth != 0; + const bool per_pixel_depth = uid_data->per_pixel_depth != 0; + const bool bounding_box = + host_config.bounding_box && g_ActiveConfig.BBoxUseFragmentShaderImplementation(); + const u32 numTexgen = uid_data->num_texgens; + ShaderCode out; + + out.Write("// Pixel UberShader for %u texgens%s%s\n", numTexgen, + early_depth ? ", early-depth" : "", per_pixel_depth ? 
", per-pixel depth" : ""); + WritePixelShaderCommonHeader(out, ApiType, bounding_box); + WriteUberShaderCommonHeader(out, ApiType, host_config); + + out.Write("struct VS_OUTPUT {\n"); + GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting, ""); + out.Write("};\n"); + + // Shader inputs/outputs in GLSL (HLSL is in main). + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + if (use_dual_source) + { + if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_FRAGMENT_SHADER_INDEX_DECORATION)) + { + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION(1) out vec4 ocol1;\n"); + } + else + { + out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 0) out vec4 ocol0;\n"); + out.Write("FRAGMENT_OUTPUT_LOCATION_INDEXED(0, 1) out vec4 ocol1;\n"); + } + } + else + { + out.Write("FRAGMENT_OUTPUT_LOCATION(0) out vec4 ocol0;\n"); + } + + if (per_pixel_depth) + out.Write("#define depth gl_FragDepth\n"); + + if (host_config.backend_geometry_shaders || ApiType == APIType::Vulkan) + { + out.Write("VARYING_LOCATION(0) in VertexData {\n"); + GenerateVSOutputMembers(out, ApiType, numTexgen, per_pixel_lighting, + GetInterpolationQualifier(msaa, ssaa)); + + if (stereo) + out.Write(" flat int layer;\n"); + + out.Write("};\n\n"); + } + else + { + out.Write("%s in float4 colors_0;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s in float4 colors_1;\n", GetInterpolationQualifier(msaa, ssaa)); + // compute window position if needed because binding semantic WPOS is not widely supported + // Let's set up attributes + for (u32 i = 0; i < numTexgen; ++i) + out.Write("%s in float3 tex%d;\n", GetInterpolationQualifier(msaa, ssaa), i); + out.Write("%s in float4 clipPos;\n", GetInterpolationQualifier(msaa, ssaa)); + if (per_pixel_lighting) + { + out.Write("%s in float3 Normal;\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write("%s in float3 WorldPos;\n", GetInterpolationQualifier(msaa, ssaa)); + } + } + } + + // Uniform 
index -> texture coordinates + if (numTexgen > 0) + { + if (ApiType != APIType::D3D) + { + out.Write("float3 selectTexCoord(uint index) {\n"); + } + else + { + out.Write("float3 selectTexCoord(uint index"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(", float3 tex%u", i); + out.Write(") {\n"); + } + + out.Write(" switch (index) {\n"); + for (u32 i = 0; i < numTexgen; i++) + { + out.Write(" case %uu:\n" + " return tex%u;\n", + i, i); + } + out.Write(" default:\n" + " return float3(0.0, 0.0, 0.0);\n" + " }\n" + "}\n\n"); + } + + // TODO: Per pixel lighting (not really needed) + + // ===================== + // Texture Sampling + // ===================== + + if (host_config.backend_dynamic_sampler_indexing) + { + // Doesn't look like directx supports this. Oh well the code path is here just incase it + // supports this in the future. + out.Write("int4 sampleTexture(uint sampler_num, float2 uv) {\n"); + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write(" return iround(texture(samp[sampler_num], float3(uv, 0.0)) * 255.0);\n"); + else if (ApiType == APIType::D3D) + out.Write(" return iround(Tex[sampler_num].Sample(samp[sampler_num], float3(uv, 0.0)) * " + "255.0);\n"); + out.Write("}\n\n"); + } + else + { + out.Write("int4 sampleTexture(uint sampler_num, float2 uv) {\n" + " // This is messy, but DirectX, OpenGl 3.3 and Opengl ES 3.0 doesn't support " + "dynamic indexing of the sampler array\n" + " // With any luck the shader compiler will optimise this if the hardware supports " + "dynamic indexing.\n" + " switch(sampler_num) {\n"); + for (int i = 0; i < 8; i++) + { + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + out.Write(" case %du: return iround(texture(samp[%d], float3(uv, 0.0)) * 255.0);\n", i, i); + else if (ApiType == APIType::D3D) + out.Write(" case %du: return iround(Tex[%d].Sample(samp[%d], float3(uv, 0.0)) * 255.0);\n", + i, i, i); + } + out.Write(" }\n" + "}\n\n"); + } + + // ====================== + // Arbatary 
Swizzling + // ====================== + + out.Write("int4 Swizzle(uint s, int4 color) {\n" + " // AKA: Color Channel Swapping\n" + "\n" + " int4 ret;\n"); + out.Write(" ret.r = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u)", TevKSel().swap1).c_str()); + out.Write(" ret.g = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u)", TevKSel().swap2).c_str()); + out.Write(" ret.b = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u + 1u)", TevKSel().swap1).c_str()); + out.Write(" ret.a = color[%s];\n", + BitfieldExtract("bpmem_tevksel(s * 2u + 1u)", TevKSel().swap2).c_str()); + out.Write(" return ret;\n" + "}\n\n"); + + // ====================== + // Indirect Wrappping + // ====================== + out.Write("int Wrap(int coord, uint mode) {\n" + " if (mode == 0u) // ITW_OFF\n" + " return coord;\n" + " else if (mode < 6u) // ITW_256 to ITW_16\n" + " return coord & (0xfffe >> mode);\n" + " else // ITW_0\n" + " return 0;\n" + "}\n\n"); + + // ====================== + // Indirect Lookup + // ====================== + auto LookupIndirectTexture = [&out](const char* out_var_name, const char* in_index_name) { + out.Write( + "{\n" + " uint iref = bpmem_iref(%s);\n" + " if ( iref != 0u)\n" + " {\n" + " uint texcoord = bitfieldExtract(iref, 0, 3);\n" + " uint texmap = bitfieldExtract(iref, 8, 3);\n" + " float3 uv = getTexCoord(texcoord);\n" + " int2 fixedPoint_uv = int2((uv.z == 0.0 ? 
uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS + "[texcoord].zw);\n" + "\n" + " if ((%s & 1u) == 0u)\n" + " fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[%s >> 1].xy;\n" + " else\n" + " fixedPoint_uv = fixedPoint_uv >> " I_INDTEXSCALE "[%s >> 1].zw;\n" + "\n" + " %s = sampleTexture(texmap, float2(fixedPoint_uv) * " I_TEXDIMS "[texmap].xy).abg;\n" + " }\n" + " else\n" + " {\n" + " %s = int3(0, 0, 0);\n" + " }\n" + "}\n", + in_index_name, in_index_name, in_index_name, in_index_name, out_var_name, out_var_name); + }; + + // ====================== + // TEV's Special Lerp + // ====================== + auto WriteTevLerp = [&out](const char* components) { + out.Write("// TEV's Linear Interpolate, plus bias, add/subtract and scale\n" + "int%s tevLerp%s(int%s A, int%s B, int%s C, int%s D, uint bias, bool op, bool alpha, " + "uint shift) {\n" + " // Scale C from 0..255 to 0..256\n" + " C += C >> 7;\n" + "\n" + " // Add bias to D\n" + " if (bias == 1u) D += 128;\n" + " else if (bias == 2u) D -= 128;\n" + "\n" + " int%s lerp = (A << 8) + (B - A)*C;\n" + " if (shift != 3u) {\n" + " lerp = lerp << shift;\n" + " D = D << shift;\n" + " }\n" + "\n" + " if ((shift == 3u) == alpha)\n" + " lerp = lerp + (op ? 
127 : 128);\n" + "\n" + " int%s result = lerp >> 8;\n" + "\n" + " // Add/Subtract D\n" + " if(op) // Subtract\n" + " result = D - result;\n" + " else // Add\n" + " result = D + result;\n" + "\n" + " // Most of the Shift was moved inside the lerp for improved percision\n" + " // But we still do the divide by 2 here\n" + " if (shift == 3u)\n" + " result = result >> 1;\n" + " return result;\n" + "}\n\n", + components, components, components, components, components, components, components, + components); + }; + WriteTevLerp(""); // int + WriteTevLerp("3"); // int3 + + // ======================= + // TEV's Color Compare + // ======================= + + out.Write( + "// Implements operations 0-5 of tev's compare mode,\n" + "// which are common to both color and alpha channels\n" + "bool tevCompare(uint op, int3 color_A, int3 color_B) {\n" + " switch (op) {\n" + " case 0u: // TEVCMP_R8_GT\n" + " return (color_A.r > color_B.r);\n" + " case 1u: // TEVCMP_R8_EQ\n" + " return (color_A.r == color_B.r);\n" + " case 2u: // TEVCMP_GR16_GT\n" + " int A_16 = (color_A.r | (color_A.g << 8));\n" + " int B_16 = (color_B.r | (color_B.g << 8));\n" + " return A_16 > B_16;\n" + " case 3u: // TEVCMP_GR16_EQ\n" + " return (color_A.r == color_B.r && color_A.g == color_B.g);\n" + " case 4u: // TEVCMP_BGR24_GT\n" + " int A_24 = (color_A.r | (color_A.g << 8) | (color_A.b << 16));\n" + " int B_24 = (color_B.r | (color_B.g << 8) | (color_B.b << 16));\n" + " return A_24 > B_24;\n" + " case 5u: // TEVCMP_BGR24_EQ\n" + " return (color_A.r == color_B.r && color_A.g == color_B.g && color_A.b == color_B.b);\n" + " default:\n" + " return false;\n" + " }\n" + "}\n\n"); + + // ================= + // Alpha Compare + // ================= + + out.Write("// Helper function for Alpha Test\n" + "bool alphaCompare(int a, int b, uint compare) {\n" + " switch (compare) {\n" + " case 0u: // NEVER\n" + " return false;\n" + " case 1u: // LESS\n" + " return a < b;\n" + " case 2u: // EQUAL\n" + " return a == b;\n" + " 
case 3u: // LEQUAL\n" + " return a <= b;\n" + " case 4u: // GREATER\n" + " return a > b;\n" + " case 5u: // NEQUAL;\n" + " return a != b;\n" + " case 6u: // GEQUAL\n" + " return a >= b;\n" + " case 7u: // ALWAYS\n" + " return true;\n" + " }\n" + "}\n\n"); + + // ================= + // Input Selects + // ================= + + out.Write("struct State {\n" + " int4 Reg[4];\n" + " int4 TexColor;\n" + " int AlphaBump;\n" + "};\n" + "struct StageState {\n" + " uint stage;\n" + " uint order;\n" + " uint cc;\n" + " uint ac;\n"); + + // For D3D, we need to store colors in the struct, since we access it from outside + // the main function, where they are declared. Hopefully the compiler can propagate + // these through when it inlines the function. + if (ApiType == APIType::D3D) + { + for (u32 i = 0; i < numTexgen; i++) + out.Write(" float3 tex%d;\n", i); + out.Write(" float4 colors_0;\n" + " float4 colors_1;\n"); + } + + out.Write("};\n" + "\n" + "int4 getRasColor(State s, StageState ss);\n" + "int4 getKonstColor(State s, StageState ss);\n" + "\n" + "int3 selectColorInput(State s, StageState ss, uint index) {\n" + " switch (index) {\n" + " case 0u: // prev.rgb\n" + " return s.Reg[0].rgb;\n" + " case 1u: // prev.aaa\n" + " return s.Reg[0].aaa;\n" + " case 2u: // c0.rgb\n" + " return s.Reg[1].rgb;\n" + " case 3u: // c0.aaa\n" + " return s.Reg[1].aaa;\n" + " case 4u: // c1.rgb\n" + " return s.Reg[2].rgb;\n" + " case 5u: // c1.aaa\n" + " return s.Reg[2].aaa;\n" + " case 6u: // c2.rgb\n" + " return s.Reg[3].rgb;\n" + " case 7u: // c2.aaa\n" + " return s.Reg[3].aaa;\n" + " case 8u:\n" + " return s.TexColor.rgb;\n" + " case 9u:\n" + " return s.TexColor.aaa;\n" + " case 10u:\n" + " return getRasColor(s, ss).rgb;\n" + " case 11u:\n" + " return getRasColor(s, ss).aaa;\n" + " case 12u: // One\n" + " return int3(255, 255, 255);\n" + " case 13u: // Half\n" + " return int3(128, 128, 128);\n" + " case 14u:\n" + " return getKonstColor(s, ss).rgb;\n" + " case 15u: // Zero\n" + " return 
int3(0, 0, 0);\n" + " }\n" + "}\n" + "\n" + "int selectAlphaInput(State s, StageState ss, uint index) {\n" + " switch (index) {\n" + " case 0u: // prev.a\n" + " return s.Reg[0].a;\n" + " case 1u: // c0.a\n" + " return s.Reg[1].a;\n" + " case 2u: // c1.a\n" + " return s.Reg[2].a;\n" + " case 3u: // c2.a\n" + " return s.Reg[3].a;\n" + " case 4u:\n" + " return s.TexColor.a;\n" + " case 5u:\n" + " return getRasColor(s, ss).a;\n" + " case 6u:\n" + " return getKonstColor(s, ss).a;\n" + " case 7u: // Zero\n" + " return 0;\n" + " }\n" + "}\n" + "\n" + "int4 getTevReg(in State s, uint index) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " return s.Reg[0];\n" + " case 1u: // c0\n" + " return s.Reg[1];\n" + " case 2u: // c1\n" + " return s.Reg[2];\n" + " case 3u: // c2\n" + " return s.Reg[3];\n" + " default: // prev\n" + " return s.Reg[0];\n" + " }\n" + "}\n" + "\n" + "void setRegColor(inout State s, uint index, int3 color) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " s.Reg[0].rgb = color;\n" + " break;\n" + " case 1u: // c0\n" + " s.Reg[1].rgb = color;\n" + " break;\n" + " case 2u: // c1\n" + " s.Reg[2].rgb = color;\n" + " break;\n" + " case 3u: // c2\n" + " s.Reg[3].rgb = color;\n" + " break;\n" + " }\n" + "}\n" + "\n" + "void setRegAlpha(inout State s, uint index, int alpha) {\n" + " switch (index) {\n" + " case 0u: // prev\n" + " s.Reg[0].a = alpha;\n" + " break;\n" + " case 1u: // c0\n" + " s.Reg[1].a = alpha;\n" + " break;\n" + " case 2u: // c1\n" + " s.Reg[2].a = alpha;\n" + " break;\n" + " case 3u: // c2\n" + " s.Reg[3].a = alpha;\n" + " break;\n" + " }\n" + "}\n" + "\n"); + + // Since the texture coodinate variables aren't global, we need to pass + // them to the select function in D3D. 
+ if (numTexgen > 0) + { + if (ApiType != APIType::D3D) + { + out.Write("#define getTexCoord(index) selectTexCoord((index))\n\n"); + } + else + { + out.Write("#define getTexCoord(index) selectTexCoord((index)"); + for (u32 i = 0; i < numTexgen; i++) + out.Write(", tex%u", i); + out.Write(")\n\n"); + } + } + + if (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) + { + if (early_depth && host_config.backend_early_z) + out.Write("FORCE_EARLY_Z;\n"); + + out.Write("void main()\n{\n"); + out.Write(" float4 rawpos = gl_FragCoord;\n"); + } + else // D3D + { + if (early_depth && host_config.backend_early_z) + out.Write("[earlydepthstencil]\n"); + + out.Write("void main(\n" + " out float4 ocol0 : SV_Target0,\n" + " out float4 ocol1 : SV_Target1,\n" + " %s\n", + per_pixel_depth ? "\n out float depth : SV_Depth," : ""); + out.Write(" in float4 rawpos : SV_Position,\n"); + + out.Write(" in %s float4 colors_0 : COLOR0,\n", GetInterpolationQualifier(msaa, ssaa)); + out.Write(" in %s float4 colors_1 : COLOR1", GetInterpolationQualifier(msaa, ssaa)); + + // compute window position if needed because binding semantic WPOS is not widely supported + for (u32 i = 0; i < numTexgen; ++i) + out.Write(",\n in %s float3 tex%u : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), i, + i); + out.Write("\n,\n in %s float4 clipPos : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen); + if (per_pixel_lighting) + { + out.Write(",\n in %s float3 Normal : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen + 1); + out.Write(",\n in %s float3 WorldPos : TEXCOORD%u", GetInterpolationQualifier(msaa, ssaa), + numTexgen + 2); + } + out.Write(",\n in float clipDist0 : SV_ClipDistance0\n"); + out.Write(",\n in float clipDist1 : SV_ClipDistance1\n"); + if (stereo) + out.Write(",\n in uint layer : SV_RenderTargetArrayIndex\n"); + out.Write("\n ) {\n"); + } + + out.Write(" int3 tevcoord = int3(0, 0, 0);\n" + " State s;\n" + " s.TexColor = int4(0, 0, 0, 0);\n" + " 
s.AlphaBump = 0;\n" + "\n"); + for (int i = 0; i < 4; i++) + out.Write(" s.Reg[%d] = " I_COLORS "[%d];\n", i, i); + + out.Write(" uint num_stages = %s;\n\n", + BitfieldExtract("bpmem_genmode", bpmem.genMode.numtevstages).c_str()); + + out.Write(" // Main tev loop\n"); + if (ApiType == APIType::D3D) + { + // Tell DirectX we don't want this loop unrolled (it crashes if it tries to) + out.Write(" [loop]\n"); + } + + out.Write(" for(uint stage = 0u; stage <= num_stages; stage++)\n" + " {\n" + " StageState ss;\n" + " ss.stage = stage;\n" + " ss.cc = bpmem_combiners(stage).x;\n" + " ss.ac = bpmem_combiners(stage).y;\n" + " ss.order = bpmem_tevorder(stage>>1);\n" + " if ((stage & 1u) == 1u)\n" + " ss.order = ss.order >> %d;\n\n", + int(TwoTevStageOrders().enable1.StartBit() - TwoTevStageOrders().enable0.StartBit())); + + if (ApiType == APIType::D3D) + { + out.Write(" ss.colors_0 = colors_0;\n" + " ss.colors_1 = colors_1;\n"); + } + + // Disable texturing when there are no texgens (for now) + if (numTexgen != 0) + { + out.Write(" uint tex_coord = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().texcoord0).c_str()); + out.Write(" float3 uv = getTexCoord(tex_coord);\n" + " int2 fixedPoint_uv = int2((uv.z == 0.0 ? 
uv.xy : (uv.xy / uv.z)) * " I_TEXDIMS + "[tex_coord].zw);\n" + "\n" + " bool texture_enabled = (ss.order & %du) != 0u;\n", + 1 << TwoTevStageOrders().enable0.StartBit()); + out.Write("\n" + " // Indirect textures\n" + " uint tevind = bpmem_tevind(stage);\n" + " if (tevind != 0u)\n" + " {\n" + " uint bs = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().bs).c_str()); + out.Write(" uint fmt = %s;\n", BitfieldExtract("tevind", TevStageIndirect().fmt).c_str()); + out.Write(" uint bias = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().bias).c_str()); + out.Write(" uint bt = %s;\n", BitfieldExtract("tevind", TevStageIndirect().bt).c_str()); + out.Write(" uint mid = %s;\n", BitfieldExtract("tevind", TevStageIndirect().mid).c_str()); + out.Write("\n"); + out.Write(" int3 indcoord;\n"); + LookupIndirectTexture("indcoord", "bt"); + out.Write(" if (bs != 0u)\n" + " s.AlphaBump = indcoord[bs - 1u];\n" + " switch(fmt)\n" + " {\n" + " case %iu:\n", + ITF_8); + out.Write(" indcoord.x = indcoord.x + ((bias & 1u) != 0u ? -128 : 0);\n" + " indcoord.y = indcoord.y + ((bias & 2u) != 0u ? -128 : 0);\n" + " indcoord.z = indcoord.z + ((bias & 4u) != 0u ? -128 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf8;\n" + " break;\n" + " case %iu:\n", + ITF_5); + out.Write(" indcoord.x = (indcoord.x & 0x1f) + ((bias & 1u) != 0u ? 1 : 0);\n" + " indcoord.y = (indcoord.y & 0x1f) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x1f) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xe0;\n" + " break;\n" + " case %iu:\n", + ITF_4); + out.Write(" indcoord.x = (indcoord.x & 0x0f) + ((bias & 1u) != 0u ? 1 : 0);\n" + " indcoord.y = (indcoord.y & 0x0f) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x0f) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf0;\n" + " break;\n" + " case %iu:\n", + ITF_3); + out.Write(" indcoord.x = (indcoord.x & 0x07) + ((bias & 1u) != 0u ? 
1 : 0);\n" + " indcoord.y = (indcoord.y & 0x07) + ((bias & 2u) != 0u ? 1 : 0);\n" + " indcoord.z = (indcoord.z & 0x07) + ((bias & 4u) != 0u ? 1 : 0);\n" + " s.AlphaBump = s.AlphaBump & 0xf8;\n" + " break;\n" + " }\n" + "\n" + " // Matrix multiply\n" + " int2 indtevtrans = int2(0, 0);\n" + " if ((mid & 3u) != 0u)\n" + " {\n" + " uint mtxidx = 2u * ((mid & 3u) - 1u);\n" + " int shift = " I_INDTEXMTX "[mtxidx].w;\n" + "\n" + " switch (mid >> 2)\n" + " {\n" + " case 0u: // 3x2 S0.10 matrix\n" + " indtevtrans = int2(idot(" I_INDTEXMTX + "[mtxidx].xyz, indcoord), idot(" I_INDTEXMTX "[mtxidx + 1u].xyz, indcoord)) >> 3;\n" + " break;\n" + " case 1u: // S matrix, S17.7 format\n" + " indtevtrans = (fixedPoint_uv * indcoord.xx) >> 8;\n" + " break;\n" + " case 2u: // T matrix, S17.7 format\n" + " indtevtrans = (fixedPoint_uv * indcoord.yy) >> 8;\n" + " break;\n" + " }\n" + "\n" + " if (shift >= 0)\n" + " indtevtrans = indtevtrans >> shift;\n" + " else\n" + " indtevtrans = indtevtrans << ((-shift) & 31);\n" + " }\n" + "\n" + " // Wrapping\n" + " uint sw = %s;\n", + BitfieldExtract("tevind", TevStageIndirect().sw).c_str()); + out.Write(" uint tw = %s; \n", BitfieldExtract("tevind", TevStageIndirect().tw).c_str()); + out.Write( + " int2 wrapped_coord = int2(Wrap(fixedPoint_uv.x, sw), Wrap(fixedPoint_uv.y, tw));\n" + "\n" + " if ((tevind & %du) != 0u) // add previous tevcoord\n", + 1 << TevStageIndirect().fb_addprev.StartBit()); + out.Write(" tevcoord.xy += wrapped_coord + indtevtrans;\n" + " else\n" + " tevcoord.xy = wrapped_coord + indtevtrans;\n" + "\n" + " // Emulate s24 overflows\n" + " tevcoord.xy = (tevcoord.xy << 8) >> 8;\n" + " }\n" + " else if (texture_enabled)\n" + " {\n" + " tevcoord.xy = fixedPoint_uv;\n" + " }\n" + "\n" + " // Sample texture for stage\n" + " if(texture_enabled) {\n" + " uint sampler_num = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().texmap0).c_str()); + out.Write("\n" + " float2 uv = (float2(tevcoord.xy)) * " I_TEXDIMS 
"[sampler_num].xy;\n" + "\n" + " int4 color = sampleTexture(sampler_num, uv);\n" + "\n" + " uint swap = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.tswap).c_str()); + out.Write(" s.TexColor = Swizzle(swap, color);\n"); + out.Write(" } else {\n" + " // Texture is disabled\n" + " s.TexColor = int4(255, 255, 255, 255);\n" + " }\n" + "\n"); + } + + out.Write(" // This is the Meat of TEV\n" + " {\n" + " // Color Combiner\n"); + out.Write(" uint color_a = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.a).c_str()); + out.Write(" uint color_b = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.b).c_str()); + out.Write(" uint color_c = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.c).c_str()); + out.Write(" uint color_d = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.d).c_str()); + + out.Write(" uint color_bias = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.bias).c_str()); + out.Write(" bool color_op = bool(%s);\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.op).c_str()); + out.Write(" bool color_clamp = bool(%s);\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.clamp).c_str()); + out.Write(" uint color_shift = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.shift).c_str()); + out.Write(" uint color_dest = %s;\n", + BitfieldExtract("ss.cc", TevStageCombiner().colorC.dest).c_str()); + + out.Write( + " uint color_compare_op = color_shift << 1 | uint(color_op);\n" + "\n" + " int3 color_A = selectColorInput(s, ss, color_a) & int3(255, 255, 255);\n" + " int3 color_B = selectColorInput(s, ss, color_b) & int3(255, 255, 255);\n" + " int3 color_C = selectColorInput(s, ss, color_c) & int3(255, 255, 255);\n" + " int3 color_D = selectColorInput(s, ss, color_d); // 10 bits + sign\n" // TODO: do we + // need to sign + // extend? 
+ "\n" + " int3 color;\n" + " if(color_bias != 3u) { // Normal mode\n" + " color = tevLerp3(color_A, color_B, color_C, color_D, color_bias, color_op, false, " + "color_shift);\n" + " } else { // Compare mode\n" + " // op 6 and 7 do a select per color channel\n" + " if (color_compare_op == 6u) {\n" + " // TEVCMP_RGB8_GT\n" + " color.r = (color_A.r > color_B.r) ? color_C.r : 0;\n" + " color.g = (color_A.g > color_B.g) ? color_C.g : 0;\n" + " color.b = (color_A.b > color_B.b) ? color_C.b : 0;\n" + " } else if (color_compare_op == 7u) {\n" + " // TEVCMP_RGB8_EQ\n" + " color.r = (color_A.r == color_B.r) ? color_C.r : 0;\n" + " color.g = (color_A.g == color_B.g) ? color_C.g : 0;\n" + " color.b = (color_A.b == color_B.b) ? color_C.b : 0;\n" + " } else {\n" + " // The remaining ops do one compare which selects all 3 channels\n" + " color = tevCompare(color_compare_op, color_A, color_B) ? color_C : int3(0, 0, " + "0);\n" + " }\n" + " color = color_D + color;\n" + " }\n" + "\n" + " // Clamp result\n" + " if (color_clamp)\n" + " color = clamp(color, 0, 255);\n" + " else\n" + " color = clamp(color, -1024, 1023);\n" + "\n" + " // Write result to the correct input register of the next stage\n" + " setRegColor(s, color_dest, color);\n" + "\n"); + + // Alpha combiner + out.Write(" // Alpha Combiner\n"); + out.Write(" uint alpha_a = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.a).c_str()); + out.Write(" uint alpha_b = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.b).c_str()); + out.Write(" uint alpha_c = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.c).c_str()); + out.Write(" uint alpha_d = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.d).c_str()); + + out.Write(" uint alpha_bias = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.bias).c_str()); + out.Write(" bool alpha_op = bool(%s);\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.op).c_str()); + out.Write(" bool alpha_clamp = bool(%s);\n", + 
BitfieldExtract("ss.ac", TevStageCombiner().alphaC.clamp).c_str()); + out.Write(" uint alpha_shift = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.shift).c_str()); + out.Write(" uint alpha_dest = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.dest).c_str()); + + out.Write( + " uint alpha_compare_op = alpha_shift << 1 | uint(alpha_op);\n" + "\n" + " int alpha_A;\n" + " int alpha_B;\n" + " if (alpha_bias != 3u || alpha_compare_op > 5u) {\n" + " // Small optimisation here: alpha_A and alpha_B are unused by compare ops 0-5\n" + " alpha_A = selectAlphaInput(s, ss, alpha_a) & 255;\n" + " alpha_B = selectAlphaInput(s, ss, alpha_b) & 255;\n" + " };\n" + " int alpha_C = selectAlphaInput(s, ss, alpha_c) & 255;\n" + " int alpha_D = selectAlphaInput(s, ss, alpha_d); // 10 bits + sign\n" // TODO: do we + // need to sign + // extend? + "\n" + " int alpha;\n" + " if(alpha_bias != 3u) { // Normal mode\n" + " alpha = tevLerp(alpha_A, alpha_B, alpha_C, alpha_D, alpha_bias, alpha_op, " + "true, alpha_shift);\n" + " } else { // Compare mode\n" + " if (alpha_compare_op == 6u) {\n" + " // TEVCMP_A8_GT\n" + " alpha = (alpha_A > alpha_B) ? alpha_C : 0;\n" + " } else if (alpha_compare_op == 7u) {\n" + " // TEVCMP_A8_EQ\n" + " alpha = (alpha_A == alpha_B) ? alpha_C : 0;\n" + " } else {\n" + " // All remaining alpha compare ops actually compare the color channels\n" + " alpha = tevCompare(alpha_compare_op, color_A, color_B) ? alpha_C : 0;\n" + " }\n" + " alpha = alpha_D + alpha;\n" + " }\n" + "\n" + " // Clamp result\n" + " if (alpha_clamp)\n" + " alpha = clamp(alpha, 0, 255);\n" + " else\n" + " alpha = clamp(alpha, -1024, 1023);\n" + "\n" + " // Write result to the correct input register of the next stage\n" + " setRegAlpha(s, alpha_dest, alpha);\n" + " }\n"); + + out.Write(" } // Main tev loop\n" + "\n"); + + // Select the output color and alpha registers from the last stage. 
+ out.Write(" int4 TevResult;\n"); + out.Write( + " TevResult.xyz = getTevReg(s, %s).xyz;\n", + BitfieldExtract("bpmem_combiners(num_stages).x", TevStageCombiner().colorC.dest).c_str()); + out.Write( + " TevResult.w = getTevReg(s, %s).w;\n", + BitfieldExtract("bpmem_combiners(num_stages).y", TevStageCombiner().alphaC.dest).c_str()); + + out.Write(" TevResult &= 255;\n\n"); + + if (host_config.fast_depth_calc) + { + if (ApiType == APIType::D3D || ApiType == APIType::Vulkan) + out.Write(" int zCoord = int((1.0 - rawpos.z) * 16777216.0);\n"); + else + out.Write(" int zCoord = int(rawpos.z * 16777216.0);\n"); + out.Write(" zCoord = clamp(zCoord, 0, 0xFFFFFF);\n" + "\n"); + } + else + { + out.Write("\tint zCoord = " I_ZBIAS "[1].x + int((clipPos.z / clipPos.w) * float(" I_ZBIAS + "[1].y));\n"); + } + + // =========== + // ZFreeze + // =========== + + if (per_pixel_depth) + { + // Zfreeze forces early depth off + out.Write(" // ZFreeze\n" + " if ((bpmem_genmode & %du) != 0u) {\n", + 1 << GenMode().zfreeze.StartBit()); + out.Write(" float2 screenpos = rawpos.xy * " I_EFBSCALE ".xy;\n"); + if (ApiType == APIType::OpenGL) + out.Write(" // Opengl has reversed vertical screenspace coordiantes\n" + " screenpos.y = 528.0 - screenpos.y;\n"); + + out.Write(" zCoord = int(" I_ZSLOPE ".z + " I_ZSLOPE ".x * screenpos.x + " I_ZSLOPE + ".y * screenpos.y);\n" + " }\n" + "\n"); + } + + // ================= + // Depth Texture + // ================= + + out.Write(" // Depth Texture\n" + " int early_zCoord = zCoord;\n" + " if (bpmem_ztex_op != 0u) {\n" + " int ztex = int(" I_ZBIAS "[1].w); // fixed bias\n" + "\n" + " // Whatever texture was in our last stage, it's now our depth texture\n" + " ztex += idot(s.TexColor.xyzw, " I_ZBIAS "[0].xyzw);\n" + " ztex += (bpmem_ztex_op == 1u) ? 
zCoord : 0;\n" + " zCoord = ztex & 0xFFFFFF;\n" + " }\n" + "\n"); + + if (per_pixel_depth) + { + out.Write(" // If early depth is enabled, write to zbuffer before depth textures\n"); + out.Write(" // If early depth isn't enabled, we write to the zbuffer here\n"); + out.Write(" int zbuffer_zCoord = bpmem_early_ztest ? early_zCoord : zCoord;\n"); + if (ApiType == APIType::D3D || ApiType == APIType::Vulkan) + out.Write(" depth = 1.0 - float(zbuffer_zCoord) / 16777216.0;\n"); + else + out.Write(" depth = float(zbuffer_zCoord) / 16777216.0;\n"); + } + + out.Write(" // Alpha Test\n" + " if (bpmem_alphaTest != 0u) {\n" + " bool comp0 = alphaCompare(TevResult.a, " I_ALPHA ".r, %s);\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().comp0).c_str()); + out.Write(" bool comp1 = alphaCompare(TevResult.a, " I_ALPHA ".g, %s);\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().comp1).c_str()); + out.Write("\n" + " // These if statements are written weirdly to work around intel and qualcom bugs " + "with handling booleans.\n" + " switch (%s) {\n", + BitfieldExtract("bpmem_alphaTest", AlphaTest().logic).c_str()); + out.Write(" case 0u: // AND\n" + " if (comp0 && comp1) break; else discard; break;\n" + " case 1u: // OR\n" + " if (comp0 || comp1) break; else discard; break;\n" + " case 2u: // XOR\n" + " if (comp0 != comp1) break; else discard; break;\n" + " case 3u: // XNOR\n" + " if (comp0 == comp1) break; else discard; break;\n" + " }\n" + " }\n" + "\n"); + + // ========= + // Dithering + // ========= + out.Write(" if (bpmem_dither) {\n" + " // Flipper uses a standard 2x2 Bayer Matrix for 6 bit dithering\n" + " // Here the matrix is encoded into the two factor constants\n" + " int2 dither = int2(rawpos.xy) & 1;\n" + " TevResult.rgb = (TevResult.rgb - (TevResult.rgb >> 6)) + abs(dither.y * 3 - " + "dither.x * 2);\n" + " }\n\n"); + + // ========= + // Fog + // ========= + + // FIXME: Fog is implemented the same as ShaderGen, but ShaderGen's fog is all hacks. 
+ // Should be fixed point, and should not make guesses about Range-Based adjustments. + out.Write(" // Fog\n" + " uint fog_function = %s;\n", + BitfieldExtract("bpmem_fogParam3", FogParam3().fsel).c_str()); + out.Write(" if (fog_function != 0u) {\n" + " // TODO: This all needs to be converted from float to fixed point\n" + " float ze;\n" + " if (%s == 0u) {\n", + BitfieldExtract("bpmem_fogParam3", FogParam3().proj).c_str()); + out.Write(" // perspective\n" + " // ze = A/(B - (Zs >> B_SHF)\n" + " ze = (" I_FOGF "[1].x * 16777216.0) / float(" I_FOGI ".y - (zCoord >> " I_FOGI + ".w));\n" + " } else {\n" + " // orthographic\n" + " // ze = a*Zs (here, no B_SHF)\n" + " ze = " I_FOGF "[1].x * float(zCoord) / 16777216.0;\n" + " }\n" + "\n" + " if (bool(%s)) {\n", + BitfieldExtract("bpmem_fogRangeBase", FogRangeParams::RangeBase().Enabled).c_str()); + out.Write(" // x_adjust = sqrt((x-center)^2 + k^2)/k\n" + " // ze *= x_adjust\n" + " // TODO Instead of this theoretical calculation, we should use the\n" + " // coefficient table given in the fog range BP registers!\n" + " float x_adjust = (2.0 * (rawpos.x / " I_FOGF "[0].y)) - 1.0 - " I_FOGF + "[0].x; \n" + " x_adjust = sqrt(x_adjust * x_adjust + " I_FOGF "[0].z * " I_FOGF + "[0].z) / " I_FOGF "[0].z;\n" + " ze *= x_adjust;\n" + " }\n" + "\n" + " float fog = clamp(ze - " I_FOGF "[1].z, 0.0, 1.0);\n" + "\n" + " if (fog_function > 3u) {\n" + " switch (fog_function) {\n" + " case 4u:\n" + " fog = 1.0 - exp2(-8.0 * fog);\n" + " break;\n" + " case 5u:\n" + " fog = 1.0 - exp2(-8.0 * fog * fog);\n" + " break;\n" + " case 6u:\n" + " fog = exp2(-8.0 * (1.0 - fog));\n" + " break;\n" + " case 7u:\n" + " fog = 1.0 - fog;\n" + " fog = exp2(-8.0 * fog * fog);\n" + " break;\n" + " }\n" + " }\n" + "\n" + " int ifog = iround(fog * 256.0);\n" + " TevResult.rgb = (TevResult.rgb * (256 - ifog) + " I_FOGCOLOR ".rgb * ifog) >> 8;\n" + " }\n" + "\n"); + + // TODO: Do we still want to support two pass alpha blending? 
+ out.Write(" if (bpmem_rgba6_format)\n" + " ocol0.rgb = float3(TevResult.rgb >> 2) / 63.0;\n" + " else\n" + " ocol0.rgb = float3(TevResult.rgb) / 255.0;\n" + "\n" + " if (bpmem_dstalpha != 0u)\n"); + out.Write(" ocol0.a = float(%s >> 2) / 63.0;\n", + BitfieldExtract("bpmem_dstalpha", ConstantAlpha().alpha).c_str()); + out.Write(" else\n" + " ocol0.a = float(TevResult.a >> 2) / 63.0;\n" + " \n"); + + if (use_dual_source) + { + out.Write(" // Dest alpha override (dual source blending)\n" + " // Colors will be blended against the alpha from ocol1 and\n" + " // the alpha from ocol0 will be written to the framebuffer.\n" + " ocol1 = float4(0.0, 0.0, 0.0, float(TevResult.a) / 255.0);\n"); + } + + if (bounding_box) + { + const char* atomic_op = + (ApiType == APIType::OpenGL || ApiType == APIType::Vulkan) ? "atomic" : "Interlocked"; + out.Write(" if (bpmem_bounding_box) {\n"); + out.Write(" if(bbox_data[0] > int(rawpos.x)) %sMin(bbox_data[0], int(rawpos.x));\n", + atomic_op); + out.Write(" if(bbox_data[1] < int(rawpos.x)) %sMax(bbox_data[1], int(rawpos.x));\n", + atomic_op); + out.Write(" if(bbox_data[2] > int(rawpos.y)) %sMin(bbox_data[2], int(rawpos.y));\n", + atomic_op); + out.Write(" if(bbox_data[3] < int(rawpos.y)) %sMax(bbox_data[3], int(rawpos.y));\n", + atomic_op); + out.Write(" }\n"); + } + + out.Write("}\n" + "\n" + "int4 getRasColor(State s, StageState ss) {\n" + " // Select Ras for stage\n" + " uint ras = %s;\n", + BitfieldExtract("ss.order", TwoTevStageOrders().colorchan0).c_str()); + out.Write(" if (ras < 2u) { // Lighting Channel 0 or 1\n" + " int4 color = iround(((ras == 0u) ? %scolors_0 : %scolors_1) * 255.0);\n", + (ApiType == APIType::D3D) ? "ss." : "", (ApiType == APIType::D3D) ? "ss." 
: ""); + out.Write(" uint swap = %s;\n", + BitfieldExtract("ss.ac", TevStageCombiner().alphaC.rswap).c_str()); + out.Write(" return Swizzle(swap, color);\n"); + out.Write(" } else if (ras == 5u) { // Alpha Bumb\n" + " return int4(s.AlphaBump, s.AlphaBump, s.AlphaBump, s.AlphaBump);\n" + " } else if (ras == 6u) { // Normalzied Alpha Bump\n" + " int normalized = s.AlphaBump | s.AlphaBump >> 5;\n" + " return int4(normalized, normalized, normalized, normalized);\n" + " } else {\n" + " return int4(0, 0, 0, 0);\n" + " }\n" + "}\n" + "\n" + "int4 getKonstColor(State s, StageState ss) {\n" + " // Select Konst for stage\n" + " // TODO: a switch case might be better here than an dynamically" + " // indexed uniform lookup\n" + " uint tevksel = bpmem_tevksel(ss.stage>>1);\n" + " if ((ss.stage & 1u) == 0u)\n" + " return int4(konstLookup[%s].rgb, konstLookup[%s].a);\n", + BitfieldExtract("tevksel", bpmem.tevksel[0].kcsel0).c_str(), + BitfieldExtract("tevksel", bpmem.tevksel[0].kasel0).c_str()); + out.Write(" else\n" + " return int4(konstLookup[%s].rgb, konstLookup[%s].a);\n", + BitfieldExtract("tevksel", bpmem.tevksel[0].kcsel1).c_str(), + BitfieldExtract("tevksel", bpmem.tevksel[0].kasel1).c_str()); + out.Write("}\n"); + + return out; +} + +void EnumeratePixelShaderUids(const std::function& callback) +{ + PixelShaderUid uid; + std::memset(&uid, 0, sizeof(uid)); + + for (u32 texgens = 0; texgens <= 8; texgens++) + { + auto* puid = uid.GetUidData(); + puid->num_texgens = texgens; + + for (u32 early_depth = 0; early_depth < 2; early_depth++) + { + puid->early_depth = early_depth != 0; + for (u32 per_pixel_depth = 0; per_pixel_depth < 2; per_pixel_depth++) + { + // Don't generate shaders where we have early depth tests enabled, and write gl_FragDepth. 
+ if (early_depth && per_pixel_depth) + continue; + + puid->per_pixel_depth = per_pixel_depth != 0; + callback(uid); + } + } + } +} +} diff --git a/Source/Core/VideoCommon/UberShaderPixel.h b/Source/Core/VideoCommon/UberShaderPixel.h new file mode 100644 index 0000000000..d7dc8109e8 --- /dev/null +++ b/Source/Core/VideoCommon/UberShaderPixel.h @@ -0,0 +1,31 @@ +// Copyright 2015 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#pragma once + +#include +#include "VideoCommon/PixelShaderGen.h" + +namespace UberShader +{ +#pragma pack(1) +struct pixel_ubershader_uid_data +{ + u32 num_texgens : 4; + u32 early_depth : 1; + u32 per_pixel_depth : 1; + + u32 NumValues() const { return sizeof(pixel_ubershader_uid_data); } +}; +#pragma pack() + +typedef ShaderUid PixelShaderUid; + +PixelShaderUid GetPixelShaderUid(); + +ShaderCode GenPixelShader(APIType ApiType, const ShaderHostConfig& host_config, + const pixel_ubershader_uid_data* uid_data); + +void EnumeratePixelShaderUids(const std::function& callback); +} diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj index 8dad6ea023..0d1bc70f4e 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj @@ -67,6 +67,8 @@ + + @@ -109,6 +111,8 @@ + + @@ -174,4 +178,4 @@ - \ No newline at end of file + diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters index 4d0348d7b0..43f6a29b84 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters +++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters @@ -179,6 +179,12 @@ Util + + Shader Generators + + + Shader Generators + @@ -338,6 +344,12 @@ Util + + Shader Generators + + + Shader Generators + diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index 1216bcc431..e6cbfde566 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ 
b/Source/Core/VideoCommon/VideoConfig.h @@ -204,6 +204,8 @@ struct VideoConfig final bool bSupportsInternalResolutionFrameDumps; bool bSupportsGPUTextureDecoding; bool bSupportsST3CTextures; + bool bSupportsBitfield; // Needed by UberShaders, so must stay in VideoCommon + bool bSupportsDynamicSamplerIndexing; // Needed by UberShaders, so must stay in VideoCommon } backend_info; // Utility