From c7e270cf1588b7bbf73913d542ca5d40ee2728a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dar=C3=ADo?= <dariosamo@gmail.com>
Date: Tue, 5 Nov 2024 21:40:11 -0300
Subject: [PATCH] Fix TLUT masking to the lower half of TMEM. (#91)

---
 src/common/rt64_tmem_hasher.h    | 16 ++++++++++++----
 src/shaders/TextureDecoder.hlsli |  7 ++++---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/common/rt64_tmem_hasher.h b/src/common/rt64_tmem_hasher.h
index c5b0124..b260a5d 100644
--- a/src/common/rt64_tmem_hasher.h
+++ b/src/common/rt64_tmem_hasher.h
@@ -6,7 +6,7 @@
 
 namespace RT64 {
     struct TMEMHasher {
-        static const uint32_t CurrentHashVersion = 2;
+        static const uint32_t CurrentHashVersion = 3;
 
         static bool needsToHashRowsIndividually(const LoadTile &loadTile, uint32_t width) {
             // When using 32-bit formats, TMEM contents are split in half in the lower and upper half, so the size per row is effectively
@@ -24,10 +24,18 @@ namespace RT64 {
             XXH3_state_t xxh3;
             XXH3_64bits_reset(&xxh3);
             const bool RGBA32 = (loadTile.siz == G_IM_SIZ_32b) && (loadTile.fmt == G_IM_FMT_RGBA);
-            const uint32_t tmemSize = RGBA32 ? (TMEMBytes >> 1) : TMEMBytes;
+            const bool usesTLUT = tlut > 0;
+            bool halfTMEM = RGBA32;
+            
+            // Version 3 fixes an error where using TLUT did not mask the address to the lower half of TMEM.
+            if ((version >= 3) && usesTLUT) {
+                halfTMEM = true;
+            }
+
+            const uint32_t tmemSize = halfTMEM ? (TMEMBytes >> 1) : TMEMBytes;
             const uint32_t drawBytesPerRow = std::max(uint32_t(width) << (RGBA32 ? G_IM_SIZ_16b : loadTile.siz) >> 1U, 1U);
             const uint32_t drawBytesTotal = (loadTile.line << 3) * (height - 1) + drawBytesPerRow;
-            const uint32_t tmemMask = RGBA32 ? TMEMMask16 : TMEMMask8;
+            const uint32_t tmemMask = halfTMEM ? TMEMMask16 : TMEMMask8;
             const uint32_t tmemAddress = (loadTile.tmem << 3) & tmemMask;
             auto hashTMEM = [&](uint32_t tmemBaseAddress, uint32_t tmemOrAddress, uint32_t byteCount) {
                 // Too many bytes to hash in a single step. Wrap around TMEM and hash the rest.
@@ -64,7 +72,7 @@ namespace RT64 {
             }
 
             // If TLUT is active, we also hash the corresponding palette bytes.
-            if (tlut > 0) {
+            if (usesTLUT) {
                 const bool CI4 = (loadTile.siz == G_IM_SIZ_4b);
                 const int32_t paletteOffset = CI4 ? (loadTile.palette << 7) : 0;
                 const int32_t bytesToHash = CI4 ? 0x80 : 0x800;
diff --git a/src/shaders/TextureDecoder.hlsli b/src/shaders/TextureDecoder.hlsli
index 62af9c9..c85b0c4 100644
--- a/src/shaders/TextureDecoder.hlsli
+++ b/src/shaders/TextureDecoder.hlsli
@@ -168,6 +168,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     const bool oddRow = (texelInt.y & 1);
     const bool oddColumn = (texelInt.x & 1);
     const bool isRgba32 = and(fmt == G_IM_FMT_RGBA, siz == G_IM_SIZ_32b);
+    const bool usesTlut = tlut > 0;
     // Determine the left shift to use to calculate the TMEM address. Effectively log2 of the pixel stride in half-bytes.
     //   4-bit (siz 0) -> 0
     //   8-bit (siz 1) -> 1
@@ -176,8 +177,8 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     //   RGBA32 (siz 3) -> 2 (32-bit RGBA textures sample both halves of TMEM, so their pixel stride is only 16 bits).
     const uint tmemShift = select_uint(isRgba32, 2, siz);
 
-    // Determin the TMEM address mask. Each sample in RGBA32 only addresses half of TMEM.
-    const uint addressMask = select_uint(isRgba32, RDP_TMEM_MASK16, RDP_TMEM_MASK8);
+    // Determine the TMEM address mask. When using RGBA32 or TLUT, each sample only addresses half of TMEM.
+    const uint addressMask = select_uint(or(isRgba32, usesTlut), RDP_TMEM_MASK16, RDP_TMEM_MASK8);
 
     // Load the two low samples for most formats.
     const uint pixelAddress = texelInt.y * stride + ((texelInt.x << tmemShift) >> 1);
@@ -188,7 +189,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     const uint pixelShift = select_uint(oddColumn, 0, 4);
     const uint pixelValue4bit = (pixelValue0 >> pixelShift) & 0xF;
 
-    if (tlut > 0) {
+    if (usesTlut) {
         // Determine the palette index and load the value from the palette.
         const uint paletteAddress = select_uint(siz == G_IM_SIZ_4b,
             RDP_TMEM_PALETTE + (palette << 7) + ((pixelValue4bit) << 3),