Move clampWrapMirrorSample and sampleTextureLevel calls into loops to significantly improve ubershader size

2024-12-26 03:15:44 +00:00 · 2024-11-02 19:18:10 -04:00 · 2024-11-02 19:18:10 -04:00 · 2b5b379448
commit 2b5b379448
parent d450596417
1 changed files with 47 additions and 12 deletions
--- a/src/shaders/TextureSampler.hlsli
+++ b/src/shaders/TextureSampler.hlsli
@ -118,12 +118,19 @@ float4 sampleTextureLevel(const RDPTile rdpTile, const GPUTile gpuTile, bool fil
    tcScale /= mipScale;
    
    int2 texelBaseInt = floor(uvCoord);
-    float4 sample00 = clampWrapMirrorSample(rdpTile, gpuTile, tcScale, texelBaseInt + int2(0, 0), textureIndex, tlut, canDecodeTMEM, mipLevel, usesHDR);
-    if (filterBilerp || linearFiltering) {
+    bool filtering = or(filterBilerp, linearFiltering);
+    int numSamples = select_uint(filtering, 4, 1);
+    float4 samples[4];
+    [unroll]
+    for (int i = 0; i < numSamples; i++) {
+        samples[i] = clampWrapMirrorSample(rdpTile, gpuTile, tcScale, texelBaseInt + int2(i >> 1, i & 1), textureIndex, tlut, canDecodeTMEM, mipLevel, usesHDR);
+    }
+    float4 sample00 = samples[0];
+    if (filtering) {
        float2 fracPart = uvCoord - texelBaseInt;
-        float4 sample01 = clampWrapMirrorSample(rdpTile, gpuTile, tcScale, texelBaseInt + int2(0, 1), textureIndex, tlut, canDecodeTMEM, mipLevel, usesHDR);
-        float4 sample10 = clampWrapMirrorSample(rdpTile, gpuTile, tcScale, texelBaseInt + int2(1, 0), textureIndex, tlut, canDecodeTMEM, mipLevel, usesHDR);
-        float4 sample11 = clampWrapMirrorSample(rdpTile, gpuTile, tcScale, texelBaseInt + int2(1, 1), textureIndex, tlut, canDecodeTMEM, mipLevel, usesHDR);
+        float4 sample01 = samples[1];
+        float4 sample10 = samples[2];
+        float4 sample11 = samples[3];
        if (linearFiltering) {
            return lerp(lerp(sample00, sample10, fracPart.x), lerp(sample01, sample11, fracPart.x), fracPart.y);
        }
@ -187,27 +194,37 @@ float4 sampleTexture(OtherMode otherMode, RenderFlags renderFlags, float2 inputU
    const bool canDecodeTMEM = renderFlagCanDecodeTMEM(renderFlags);
    const bool usesHDR = renderFlagUsesHDR(renderFlags);
    const uint nativeSampler = rdpTile.nativeSampler;
-    if (!gpuTileFlagHasMipmaps(gpuTile.flags)) {
-        return sampleTextureLevel(rdpTile, gpuTile, filterBilerp, filterAverage, linearFiltering, uvCoord, textureIndex, tlut, canDecodeTMEM, 0, usesHDR);
+    const bool flagHasMipmaps = gpuTileFlagHasMipmaps(gpuTile.flags);
+    uint numRDPSamples = 0;
+    uint RDPMipLevels[2];
+    RDPMipLevels[0] = 0;
+    RDPMipLevels[1] = 0;
+    float mip;
+
+    // Determine the RDP sample count and mip levels.
+    if (!flagHasMipmaps) {
+        numRDPSamples = 1;
    }
    else {
        // Retrieve the dimensions of the texture for either type of sampler.
        Texture2D texture = gTextures[NonUniformResourceIndex(textureIndex)];
        uint textureWidth, textureHeight, textureLevels;
        texture.GetDimensions(0, textureWidth, textureHeight, textureLevels);
-        
+
        if (nativeSampler == NATIVE_SAMPLER_NONE) {
            float2 ddxUVxScaled = ddxUVx * gpuTile.tcScale;
            float2 ddxUVyScaled = ddyUVy * gpuTile.tcScale;
            float ddMax = max(dot(ddxUVxScaled, ddxUVxScaled), dot(ddxUVyScaled, ddxUVyScaled));
            float mipBias = -0.5f;
-            float mip = 0.5 * log2(ddMax) + mipBias;
+            mip = 0.5 * log2(ddMax) + mipBias;
            float maxMip = float(textureLevels - 1);
-            float4 hiLevel = sampleTextureLevel(rdpTile, gpuTile, filterBilerp, filterAverage, linearFiltering, uvCoord, textureIndex, tlut, canDecodeTMEM, min(floor(mip), maxMip), usesHDR);
-            float4 loLevel = sampleTextureLevel(rdpTile, gpuTile, filterBilerp, filterAverage, linearFiltering, uvCoord, textureIndex, tlut, canDecodeTMEM, min(floor(mip) + 1, maxMip), usesHDR);
-            return lerp(hiLevel, loLevel, frac(mip));
+            RDPMipLevels[0] = min(floor(mip), maxMip);
+            RDPMipLevels[1] = min(floor(mip) + 1, maxMip);
+            numRDPSamples = 2;
        }
        else {
+            // Native sampling, just return the native sampling result directly.
+            numRDPSamples = 0;
            // Must normalize the texture coordinate and the derivatives.
            float2 nativeUVCoord = uvCoord / float2(textureWidth, textureHeight);
            float2 originalSize = float2(textureWidth, textureHeight) / gpuTile.tcScale;
@ -238,4 +255,22 @@ float4 sampleTexture(OtherMode otherMode, RenderFlags renderFlags, float2 inputU
            }
        }
    }
+
+    float4 textureSamples[2];
+
+    // Perform the RDP sampling.
+    [unroll]
+    for (uint i = 0; i < numRDPSamples; i++) {
+        textureSamples[i] = sampleTextureLevel(rdpTile, gpuTile, filterBilerp, filterAverage, linearFiltering, uvCoord, textureIndex, tlut, canDecodeTMEM, RDPMipLevels[i], usesHDR);
+    }
+
+    // Compute the result from the RDP samples.
+    if (!flagHasMipmaps) {
+        return textureSamples[0];
+    }
+    else {
+        float4 hiLevel = textureSamples[0];
+        float4 loLevel = textureSamples[1];
+        return lerp(hiLevel, loLevel, frac(mip));
+    }
 }