Fix TLUT masking to the lower half of TMEM. (#91)

2024-12-26 03:15:44 +00:00 · 2024-11-05 21:40:11 -03:00 · 2024-11-05 21:40:11 -03:00 · c7e270cf15
commit c7e270cf15
parent 180f663938
2 changed files with 16 additions and 7 deletions
--- a/src/common/rt64_tmem_hasher.h
+++ b/src/common/rt64_tmem_hasher.h
@ -6,7 +6,7 @@

 namespace RT64 {
    struct TMEMHasher {
-        static const uint32_t CurrentHashVersion = 2;
+        static const uint32_t CurrentHashVersion = 3;

        static bool needsToHashRowsIndividually(const LoadTile &loadTile, uint32_t width) {
            // When using 32-bit formats, TMEM contents are split in half in the lower and upper half, so the size per row is effectively
@ -24,10 +24,18 @@ namespace RT64 {
            XXH3_state_t xxh3;
            XXH3_64bits_reset(&xxh3);
            const bool RGBA32 = (loadTile.siz == G_IM_SIZ_32b) && (loadTile.fmt == G_IM_FMT_RGBA);
-            const uint32_t tmemSize = RGBA32 ? (TMEMBytes >> 1) : TMEMBytes;
+            const bool usesTLUT = tlut > 0;
+            bool halfTMEM = RGBA32;
+            
+            // Version 3 fixes an error where the TMEM address was not masked to the lower half of TMEM when TLUT is in use.
+            if ((version >= 3) && usesTLUT) {
+                halfTMEM = true;
+            }
+
+            const uint32_t tmemSize = halfTMEM ? (TMEMBytes >> 1) : TMEMBytes;
            const uint32_t drawBytesPerRow = std::max(uint32_t(width) << (RGBA32 ? G_IM_SIZ_16b : loadTile.siz) >> 1U, 1U);
            const uint32_t drawBytesTotal = (loadTile.line << 3) * (height - 1) + drawBytesPerRow;
-            const uint32_t tmemMask = RGBA32 ? TMEMMask16 : TMEMMask8;
+            const uint32_t tmemMask = halfTMEM ? TMEMMask16 : TMEMMask8;
            const uint32_t tmemAddress = (loadTile.tmem << 3) & tmemMask;
            auto hashTMEM = [&](uint32_t tmemBaseAddress, uint32_t tmemOrAddress, uint32_t byteCount) {
                // Too many bytes to hash in a single step. Wrap around TMEM and hash the rest.
@ -64,7 +72,7 @@ namespace RT64 {
            }

            // If TLUT is active, we also hash the corresponding palette bytes.
-            if (tlut > 0) {
+            if (usesTLUT) {
                const bool CI4 = (loadTile.siz == G_IM_SIZ_4b);
                const int32_t paletteOffset = CI4 ? (loadTile.palette << 7) : 0;
                const int32_t bytesToHash = CI4 ? 0x80 : 0x800;
--- a/src/shaders/TextureDecoder.hlsli
+++ b/src/shaders/TextureDecoder.hlsli
@ -168,6 +168,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
    const bool oddRow = (texelInt.y & 1);
    const bool oddColumn = (texelInt.x & 1);
    const bool isRgba32 = and(fmt == G_IM_FMT_RGBA, siz == G_IM_SIZ_32b);
+    const bool usesTlut = tlut > 0;
    // Determine the left shift to use to calculate the TMEM address. Effectively log2 of the pixel stride in half-bytes.
    //   4-bit (siz 0) -> 0
    //   8-bit (siz 1) -> 1
@ -176,8 +177,8 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
    //   RGBA32 (siz 3) -> 2 (32-bit RGBA textures sample both halves of TMEM, so their pixel stride is only 16 bits).
    const uint tmemShift = select_uint(isRgba32, 2, siz);

-    // Determin the TMEM address mask. Each sample in RGBA32 only addresses half of TMEM.
-    const uint addressMask = select_uint(isRgba32, RDP_TMEM_MASK16, RDP_TMEM_MASK8);
+    // Determine the TMEM address mask. When using RGBA32 or TLUT, each sample only addresses half of TMEM.
+    const uint addressMask = select_uint(or(isRgba32, usesTlut), RDP_TMEM_MASK16, RDP_TMEM_MASK8);

    // Load the two low samples for most formats.
    const uint pixelAddress = texelInt.y * stride + ((texelInt.x << tmemShift) >> 1);
@ -188,7 +189,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
    const uint pixelShift = select_uint(oddColumn, 0, 4);
    const uint pixelValue4bit = (pixelValue0 >> pixelShift) & 0xF;

-    if (tlut > 0) {
+    if (usesTlut) {
        // Determine the palette index and load the value from the palette.
        const uint paletteAddress = select_uint(siz == G_IM_SIZ_4b,
            RDP_TMEM_PALETTE + (palette << 7) + ((pixelValue4bit) << 3),