diff --git a/src/common/rt64_tmem_hasher.h b/src/common/rt64_tmem_hasher.h
index c5b0124..b260a5d 100644
--- a/src/common/rt64_tmem_hasher.h
+++ b/src/common/rt64_tmem_hasher.h
@@ -6,7 +6,7 @@ namespace RT64 {
     struct TMEMHasher {
-        static const uint32_t CurrentHashVersion = 2;
+        static const uint32_t CurrentHashVersion = 3;
 
         static bool needsToHashRowsIndividually(const LoadTile &loadTile, uint32_t width) {
            // When using 32-bit formats, TMEM contents are split in half in the lower and upper half, so the size per row is effectively
@@ -24,10 +24,18 @@ namespace RT64 {
             XXH3_state_t xxh3;
             XXH3_64bits_reset(&xxh3);
             const bool RGBA32 = (loadTile.siz == G_IM_SIZ_32b) && (loadTile.fmt == G_IM_FMT_RGBA);
-            const uint32_t tmemSize = RGBA32 ? (TMEMBytes >> 1) : TMEMBytes;
+            const bool usesTLUT = tlut > 0;
+            bool halfTMEM = RGBA32;
+
+            // Version 3 fixes an error where using TLUT did not mask the address to the lower half of TMEM.
+            if ((version >= 3) && usesTLUT) {
+                halfTMEM = true;
+            }
+
+            const uint32_t tmemSize = halfTMEM ? (TMEMBytes >> 1) : TMEMBytes;
             const uint32_t drawBytesPerRow = std::max(uint32_t(width) << (RGBA32 ? G_IM_SIZ_16b : loadTile.siz) >> 1U, 1U);
             const uint32_t drawBytesTotal = (loadTile.line << 3) * (height - 1) + drawBytesPerRow;
-            const uint32_t tmemMask = RGBA32 ? TMEMMask16 : TMEMMask8;
+            const uint32_t tmemMask = halfTMEM ? TMEMMask16 : TMEMMask8;
             const uint32_t tmemAddress = (loadTile.tmem << 3) & tmemMask;
             auto hashTMEM = [&](uint32_t tmemBaseAddress, uint32_t tmemOrAddress, uint32_t byteCount) {
                 // Too many bytes to hash in a single step. Wrap around TMEM and hash the rest.
@@ -64,7 +72,7 @@ namespace RT64 {
             }
 
             // If TLUT is active, we also hash the corresponding palette bytes.
-            if (tlut > 0) {
+            if (usesTLUT) {
                 const bool CI4 = (loadTile.siz == G_IM_SIZ_4b);
                 const int32_t paletteOffset = CI4 ? (loadTile.palette << 7) : 0;
                 const int32_t bytesToHash = CI4 ? 0x80 : 0x800;
diff --git a/src/shaders/TextureDecoder.hlsli b/src/shaders/TextureDecoder.hlsli
index 62af9c9..c85b0c4 100644
--- a/src/shaders/TextureDecoder.hlsli
+++ b/src/shaders/TextureDecoder.hlsli
@@ -168,6 +168,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     const bool oddRow = (texelInt.y & 1);
     const bool oddColumn = (texelInt.x & 1);
     const bool isRgba32 = and(fmt == G_IM_FMT_RGBA, siz == G_IM_SIZ_32b);
+    const bool usesTlut = tlut > 0;
     // Determine the left shift to use to calculate the TMEM address. Effectively log2 of the pixel stride in half-bytes.
     // 4-bit (siz 0) -> 0
     // 8-bit (siz 1) -> 1
@@ -176,8 +177,8 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     // RGBA32 (siz 3) -> 2 (32-bit RGBA textures sample both halves of TMEM, so their pixel stride is only 16 bits).
     const uint tmemShift = select_uint(isRgba32, 2, siz);
 
-    // Determin the TMEM address mask. Each sample in RGBA32 only addresses half of TMEM.
-    const uint addressMask = select_uint(isRgba32, RDP_TMEM_MASK16, RDP_TMEM_MASK8);
+    // Determine the TMEM address mask. When using RGBA32 or TLUT, each sample only addresses half of TMEM.
+    const uint addressMask = select_uint(or(isRgba32, usesTlut), RDP_TMEM_MASK16, RDP_TMEM_MASK8);
 
     // Load the two low samples for most formats.
     const uint pixelAddress = texelInt.y * stride + ((texelInt.x << tmemShift) >> 1);
@@ -188,7 +189,7 @@ float4 sampleTMEM(int2 texelInt, uint siz, uint fmt, uint address, uint stride,
     const uint pixelShift = select_uint(oddColumn, 0, 4);
     const uint pixelValue4bit = (pixelValue0 >> pixelShift) & 0xF;
 
-    if (tlut > 0) {
+    if (usesTlut) {
         // Determine the palette index and load the value from the palette.
         const uint paletteAddress = select_uint(siz == G_IM_SIZ_4b,
             RDP_TMEM_PALETTE + (palette << 7) + ((pixelValue4bit) << 3),
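
For reference, the half-TMEM rule that both files now share can be condensed into a small standalone sketch. This is illustrative only, not code from the patch: tmemAddressMask is a hypothetical helper, and the constant values assume the N64's 4096-byte TMEM, with names borrowed from the RT64 sources.

    #include <cstdint>

    // Assumed values: TMEM holds 4096 bytes; the "8" mask spans all of it,
    // the "16" mask only its lower half.
    constexpr uint32_t TMEMBytes  = 4096;
    constexpr uint32_t TMEMMask8  = TMEMBytes - 1;         // 0xFFF
    constexpr uint32_t TMEMMask16 = (TMEMBytes >> 1) - 1;  // 0x7FF

    // Hypothetical helper mirroring the version 3 logic: RGBA32 always halves
    // the addressable range, and as of hash version 3 TLUT does too, since the
    // upper half of TMEM holds the palette rather than texel data.
    uint32_t tmemAddressMask(bool isRgba32, bool usesTlut, uint32_t version) {
        const bool halfTMEM = isRgba32 || ((version >= 3) && usesTlut);
        return halfTMEM ? TMEMMask16 : TMEMMask8;
    }

Note the asymmetry between the two sides of the patch: the hasher gates the new mask behind version >= 3, presumably so hashes produced by earlier versions remain reproducible, while the shader applies the halved mask unconditionally since it only ever decodes with the current behavior.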