SPU ASMJIT: allow holes in raw block data

This is preparation for further changes.
This commit shouldn't affect anything.
Nekotekina 2018-04-28 20:19:57 +03:00
parent 2fecddcde2
commit df453d6d4f
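
In short: a block is still passed to the recompiler as a flat std::vector<u32> — func[0] holds the entry point and func[1..] the raw opcode words — but a zero word inside that vector now marks a hole, an address that does not belong to the block. The verification code below therefore builds a per-chunk bit mask of valid words instead of assuming one contiguous range. A minimal illustration of the layout this prepares for (addresses and opcode values are made-up placeholders; types mirror the function below):

    // Hypothetical block at LS address 0x12340 with a hole at 0x12344
    std::vector<u32> func;
    func.push_back(0x12340);    // func[0]: entry point (LS address)
    func.push_back(0x1b004080); // func[1]: word at 0x12340 (raw, big-endian)
    func.push_back(0);          // func[2]: word at 0x12344 is a hole and is skipped
    func.push_back(0x1b004080); // func[3]: word at 0x12348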


@@ -160,6 +160,8 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
// Start compilation
m_pos = func[0];
const u32 start = m_pos;
const u32 end = m_pos + (func.size() - 1) * 4;
// Set PC and check status
c->mov(SPU_OFF_32(pc), m_pos);
@@ -174,6 +176,23 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
//c->jnz(label_stop);
}
// Get bit mask of valid code words for a given range (up to 128 bytes)
auto get_code_mask = [&](u32 starta, u32 enda) -> u32
{
u32 result = 0;
for (u32 addr = starta, m = 1; addr < enda && m; addr += 4, m <<= 1)
{
// Filter out if out of range, or is a hole
if (addr >= start && addr < end && func[(addr - start) / 4 + 1])
{
result |= m;
}
}
return result;
};
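// Example (illustrative values): with start == 0x100, end == 0x120 and a single
// hole (zero word) at 0x110, get_code_mask(0x100, 0x120) returns 0b11101111 -
// bit i of the result corresponds to the word at starta + i * 4.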
// Check code
if (false)
{
@@ -196,82 +215,78 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
words_align = 64;
const u32 starta = m_pos & -64;
const u32 end = m_pos + (func.size() - 1) * 4;
const u32 enda = ::align(end, 64);
const u32 sizea = (enda - starta) / 64;
verify(HERE), sizea;
// Load masks
if (m_pos != starta || sizea == 1)
{
Label label = c->newLabel();
c->kmovw(x86::k1, x86::word_ptr(label));
const u16 mask = (0xffff << (m_pos - starta) / 4) & (0xffff >> (sizea == 1 ? enda - end : 0) / 4);
consts.emplace_back([=]
{
c->bind(label);
c->dw(mask);
});
}
if (sizea > 1 && end != enda && end + 32 != enda)
{
Label label = c->newLabel();
c->kmovw(x86::k2, x86::word_ptr(label));
const u16 mask = 0xffff >> (enda - end) / 4;
consts.emplace_back([=]
{
c->bind(label);
c->dw(mask);
});
}
// Initialize pointers
c->lea(x86::rax, x86::qword_ptr(label_code));
c->lea(*qw1, x86::qword_ptr(*ls, starta));
u32 code_off = 0;
u32 ls_off = starta;
for (u32 j = starta; j < enda; j += 64)
{
// Small offset for disp8*N
const u32 off = (j - starta) % 8192;
const u32 cmask = get_code_mask(j, j + 64);
if (j != starta && off == 0)
if (UNLIKELY(cmask == 0))
{
// Almost unexpected: update pointers
c->lea(x86::rax, x86::qword_ptr(label_code, j));
c->lea(*qw1, x86::qword_ptr(*ls, j));
continue;
}
if (j < m_pos || j + 64 > end)
// Ensure small distance for disp8*N
if (j - ls_off >= 8192)
{
c->setExtraReg(j < m_pos || sizea == 1 ? x86::k1 : x86::k2);
c->z().vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, off));
c->lea(*qw1, x86::qword_ptr(*ls, j));
ls_off = j;
}
if (code_off >= 8192)
{
c->lea(x86::rax, x86::qword_ptr(x86::rax, 8192));
code_off -= 8192;
}
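// Note: with EVEX disp8*N compression an 8-bit displacement scaled by the 64-byte
// element size reaches at most 127 * 64 = 8128 bytes, so both pointers are re-based
// before their running offsets can exceed 8 KiB.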
if (cmask != 0xffff)
{
// Generate k-mask for the block
Label label = c->newLabel();
c->kmovw(x86::k7, x86::word_ptr(label));
consts.emplace_back([=]
{
c->bind(label);
c->dq(cmask);
});
c->setExtraReg(x86::k7);
c->z().vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, j - ls_off));
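// Lanes masked off by k7 (holes or out-of-range words) are zeroed by z(), so they
// compare equal to the zero words pushed into 'words' for the same positions below.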
}
else
{
c->vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, off));
c->vmovdqa32(x86::zmm0, x86::zword_ptr(*qw1, j - ls_off));
}
if (j == starta)
{
c->vpcmpud(x86::k1, x86::zmm0, x86::zword_ptr(x86::rax, off), 4);
c->vpcmpud(x86::k1, x86::zmm0, x86::zword_ptr(x86::rax, code_off), 4);
}
else
{
c->vpcmpud(x86::k3, x86::zmm0, x86::zword_ptr(x86::rax, off), 4);
c->vpcmpud(x86::k3, x86::zmm0, x86::zword_ptr(x86::rax, code_off), 4);
c->korw(x86::k1, x86::k3, x86::k1);
}
for (u32 i = j; i < j + 64; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
code_off += 64;
}
c->ktestw(x86::k1, x86::k1);
c->jnz(label_diff);
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
}
else if (utils::has_512())
{
@@ -279,21 +294,22 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
words_align = 32;
const u32 starta = m_pos & -32;
const u32 end = m_pos + (func.size() - 1) * 4;
const u32 enda = ::align(end, 32);
const u32 sizea = (enda - starta) / 32;
verify(HERE), sizea;
if (sizea == 1)
{
if (starta == m_pos && enda == end)
const u32 cmask = get_code_mask(starta, enda);
if (cmask == 0xff)
{
c->vmovdqa(x86::ymm0, x86::yword_ptr(*ls, starta));
}
else
{
c->vpxor(x86::ymm0, x86::ymm0, x86::ymm0);
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), (0xff << (m_pos - starta) / 4) & (0xff >> (enda - end) / 4));
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), cmask);
}
c->vpxor(x86::ymm0, x86::ymm0, x86::yword_ptr(label_code));
@@ -307,9 +323,12 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
}
else if (sizea == 2 && (end - m_pos) <= 32)
{
const u32 cmask0 = get_code_mask(starta, starta + 32);
const u32 cmask1 = get_code_mask(starta + 32, enda);
c->vpxor(x86::ymm0, x86::ymm0, x86::ymm0);
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), 0xff & (0xff << (m_pos - starta) / 4));
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta + 32), 0xff & (0xff >> (enda - end) / 4));
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), cmask0);
c->vpblendd(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta + 32), cmask1);
c->vpxor(x86::ymm0, x86::ymm0, x86::yword_ptr(label_code));
c->vptest(x86::ymm0, x86::ymm0);
c->jnz(label_diff);
@@ -321,59 +340,71 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
}
else
{
if (starta < m_pos || enda > end)
{
c->vpxor(x86::xmm2, x86::xmm2, x86::xmm2);
}
bool xmm2z = false;
// Initialize pointers
c->lea(x86::rax, x86::qword_ptr(label_code));
c->lea(*qw1, x86::qword_ptr(*ls, starta));
u32 code_off = 0;
u32 ls_off = starta;
for (u32 j = starta; j < enda; j += 32)
{
// Small offset for disp8*N
const u32 off = (j - starta) % 4096;
const u32 cmask = get_code_mask(j, j + 32);
if (j != starta && off == 0)
if (UNLIKELY(cmask == 0))
{
continue;
}
// Ensure small distance for disp8*N
if (j - ls_off >= 4096)
{
// Almost unexpected: update pointers
c->lea(x86::rax, x86::qword_ptr(label_code, j - starta));
c->lea(*qw1, x86::qword_ptr(*ls, j));
ls_off = j;
}
// Load aligned code block from LS, mask if necessary (at the end or the beginning)
if (j < m_pos)
if (code_off >= 4096)
{
c->vpblendd(x86::ymm1, x86::ymm2, x86::yword_ptr(*qw1, off), 0xff & (0xff << (m_pos - starta) / 4));
c->lea(x86::rax, x86::qword_ptr(x86::rax, 4096));
code_off -= 4096;
}
else if (j + 32 > end)
if (cmask != 0xff)
{
c->vpblendd(x86::ymm1, x86::ymm2, x86::yword_ptr(*qw1, off), 0xff & (0xff >> (enda - end) / 4));
if (!xmm2z)
{
c->vpxor(x86::xmm2, x86::xmm2, x86::xmm2);
xmm2z = true;
}
c->vpblendd(x86::ymm1, x86::ymm2, x86::yword_ptr(*qw1, j - ls_off), cmask);
}
else
{
c->vmovdqa32(x86::ymm1, x86::yword_ptr(*qw1, off));
c->vmovdqa32(x86::ymm1, x86::yword_ptr(*qw1, j - ls_off));
}
// Perform bitwise comparison and accumulate
if (j == starta)
{
c->vpxor(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, off));
c->vpxor(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, code_off));
}
else
{
c->vpternlogd(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, off), 0xf6 /* orAxorBC */);
c->vpternlogd(x86::ymm0, x86::ymm1, x86::yword_ptr(x86::rax, code_off), 0xf6 /* orAxorBC */);
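// 0xf6 = A | (B ^ C): ymm0 |= (loaded code ^ expected code), accumulating any
// mismatching bits in a single instruction.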
}
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
code_off += 32;
}
c->vptest(x86::ymm0, x86::ymm0);
c->jnz(label_diff);
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
}
}
else if (utils::has_avx())
@@ -382,21 +413,22 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
words_align = 32;
const u32 starta = m_pos & -32;
const u32 end = m_pos + (func.size() - 1) * 4;
const u32 enda = ::align(end, 32);
const u32 sizea = (enda - starta) / 32;
verify(HERE), sizea;
if (sizea == 1)
{
if (starta == m_pos && enda == end)
const u32 cmask = get_code_mask(starta, enda);
if (cmask == 0xff)
{
c->vmovaps(x86::ymm0, x86::yword_ptr(*ls, starta));
}
else
{
c->vxorps(x86::ymm0, x86::ymm0, x86::ymm0);
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), (0xff << (m_pos - starta) / 4) & (0xff >> (enda - end) / 4));
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), cmask);
}
c->vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(label_code));
@@ -410,9 +442,12 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
}
else if (sizea == 2 && (end - m_pos) <= 32)
{
const u32 cmask0 = get_code_mask(starta, starta + 32);
const u32 cmask1 = get_code_mask(starta + 32, enda);
c->vxorps(x86::ymm0, x86::ymm0, x86::ymm0);
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), 0xff & (0xff << (m_pos - starta) / 4));
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta + 32), 0xff & (0xff >> (enda - end) / 4));
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta), cmask0);
c->vblendps(x86::ymm0, x86::ymm0, x86::yword_ptr(*ls, starta + 32), cmask1);
c->vxorps(x86::ymm0, x86::ymm0, x86::yword_ptr(label_code));
c->vptest(x86::ymm0, x86::ymm0);
c->jnz(label_diff);
@@ -424,76 +459,104 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
}
else
{
if (starta < m_pos || enda > end)
{
c->vxorps(x86::xmm2, x86::xmm2, x86::xmm2);
}
bool xmm2z = false;
// Initialize pointers
c->add(*ls, starta);
c->lea(x86::rax, x86::qword_ptr(label_code));
u32 code_off = 0;
u32 ls_off = starta;
u32 order0 = 0;
u32 order1 = 0;
for (u32 j = starta; j < enda; j += 32)
{
// Small offset
const u32 off = (j - starta) % 128;
const u32 cmask = get_code_mask(j, j + 32);
if (UNLIKELY(cmask == 0))
{
continue;
}
// Interleave two threads
const auto& reg0 = off % 64 ? x86::ymm3 : x86::ymm0;
const auto& reg1 = off % 64 ? x86::ymm4 : x86::ymm1;
auto& order = order0 > order1 ? order1 : order0;
const auto& reg0 = order0 > order1 ? x86::ymm3 : x86::ymm0;
const auto& reg1 = order0 > order1 ? x86::ymm4 : x86::ymm1;
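// order0/order1 count how many blocks each accumulator pair has consumed
// (ymm0/ymm1 vs ymm3/ymm4); always extending the shorter chain keeps the two
// xor/or dependency chains balanced even when blocks are skipped as holes.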
if (j != starta && off == 0)
// Ensure small distance for disp8
if (j - ls_off >= 256)
{
c->add(*ls, j - ls_off);
ls_off = j;
}
else if (j - ls_off >= 128)
{
ls_off += 128;
c->sub(*ls, -128);
c->sub(x86::rax, -128);
ls_off += 128;
}
// Load aligned code block from LS, mask if necessary (at the end or the beginning)
if (j < m_pos)
if (code_off >= 128)
{
c->vblendps(reg1, x86::ymm2, x86::yword_ptr(*ls, off), 0xff & (0xff << (m_pos - starta) / 4));
c->sub(x86::rax, -128);
code_off -= 128;
}
else if (j + 32 > end)
if (cmask != 0xff)
{
c->vblendps(reg1, x86::ymm2, x86::yword_ptr(*ls, off), 0xff & (0xff >> (enda - end) / 4));
if (!xmm2z)
{
c->vxorps(x86::xmm2, x86::xmm2, x86::xmm2);
xmm2z = true;
}
c->vblendps(reg1, x86::ymm2, x86::yword_ptr(*ls, j - ls_off), cmask);
}
else
{
c->vmovaps(reg1, x86::yword_ptr(*ls, off));
c->vmovaps(reg1, x86::yword_ptr(*ls, j - ls_off));
}
// Perform bitwise comparison and accumulate
if (j == starta || j == starta + 32)
if (!order++)
{
c->vxorps(reg0, reg1, x86::yword_ptr(x86::rax, off));
c->vxorps(reg0, reg1, x86::yword_ptr(x86::rax, code_off));
}
else
{
c->vxorps(reg1, reg1, x86::yword_ptr(x86::rax, off));
c->vxorps(reg1, reg1, x86::yword_ptr(x86::rax, code_off));
c->vorps(reg0, reg1, reg0);
}
for (u32 i = j; i < j + 32; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
code_off += 32;
}
c->sub(*ls, ls_off);
c->vorps(x86::ymm0, x86::ymm3, x86::ymm0);
if (order1)
{
c->vorps(x86::ymm0, x86::ymm3, x86::ymm0);
}
c->vptest(x86::ymm0, x86::ymm0);
c->jnz(label_diff);
for (u32 i = starta; i < enda; i += 4)
{
words.push_back(i >= m_pos && i < end ? func[(i - m_pos) / 4 + 1] : 0);
}
}
}
else if (true)
else
{
if (utils::has_avx())
{
c->vzeroupper();
}
// Compatible SSE2
words_align = 16;
const u32 starta = m_pos & -16;
const u32 end = m_pos + (func.size() - 1) * 4;
const u32 enda = ::align(end, 16);
const u32 sizea = (enda - starta) / 16;
verify(HERE), sizea;
@@ -501,57 +564,95 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
// Initialize pointers
c->add(*ls, starta);
c->lea(x86::rax, x86::qword_ptr(label_code));
u32 code_off = 0;
u32 ls_off = starta;
u32 order0 = 0;
u32 order1 = 0;
for (u32 j = starta; j < enda; j += 16)
{
// Small offset
const u32 off = (j - starta) % 128;
const u32 cmask = get_code_mask(j, j + 16);
if (UNLIKELY(cmask == 0))
{
continue;
}
// Interleave two threads
const auto& reg0 = off % 32 ? x86::xmm3 : x86::xmm0;
const auto& reg1 = off % 32 ? x86::xmm4 : x86::xmm1;
const auto& dest = j == starta || j == starta + 16 ? reg0 : reg1;
auto& order = order0 > order1 ? order1 : order0;
const auto& reg0 = order0 > order1 ? x86::xmm3 : x86::xmm0;
const auto& reg1 = order0 > order1 ? x86::xmm4 : x86::xmm1;
if (j != starta && off == 0)
// Ensure small distance for disp8
if (j - ls_off >= 256)
{
ls_off += 128;
c->sub(*ls, -128);
c->sub(x86::rax, -128);
c->add(*ls, j - ls_off);
ls_off = j;
}
else if (j - ls_off >= 128)
{
c->sub(*ls, -128);
ls_off += 128;
}
if (code_off >= 128)
{
c->sub(x86::rax, -128);
code_off -= 128;
}
// Determine which value will be duplicated at hole positions
const u32 w3 = func.at((j - m_pos + ~::cntlz32(cmask, true) % 4 * 4) / 4 + 1);
words.push_back(cmask & 1 ? func[(j - m_pos + 0) / 4 + 1] : w3);
words.push_back(cmask & 2 ? func[(j - m_pos + 4) / 4 + 1] : w3);
words.push_back(cmask & 4 ? func[(j - m_pos + 8) / 4 + 1] : w3);
words.push_back(w3);
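// ~cntlz32(cmask, true) % 4 is effectively the index of the highest set bit of
// cmask, so w3 is the last valid word of this 16-byte block; hole positions in
// 'words' repeat it to match the pshufd-shuffled load generated below.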
// PSHUFD immediate table for all possible hole mask values, holes repeat highest valid word
static constexpr s32 s_pshufd_imm[16]
{
-1, // invalid index
0b00000000, // copy 0
0b01010101, // copy 1
0b01010100, // copy 1
0b10101010, // copy 2
0b10101000, // copy 2
0b10100110, // copy 2
0b10100100, // copy 2
0b11111111, // copy 3
0b11111100, // copy 3
0b11110111, // copy 3
0b11110100, // copy 3
0b11101111, // copy 3
0b11101100, // copy 3
0b11100111, // copy 3
0b11100100, // full
};
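// e.g. cmask == 0b0101 (words 0 and 2 valid): lanes 1 and 3 duplicate lane 2,
// giving the immediate 0b10'10'10'00 (entry 5 above).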
const auto& dest = !order++ ? reg0 : reg1;
// Load aligned code block from LS
if (j < m_pos)
if (cmask != 0xf)
{
static constexpr u8 s_masks[4]{0b11100100, 0b11100101, 0b11101010, 0b11111111};
c->pshufd(dest, x86::dqword_ptr(*ls, off), s_masks[(m_pos - starta) / 4]);
}
else if (j + 16 > end)
{
static constexpr u8 s_masks[4]{0b11100100, 0b10100100, 0b01010100, 0b00000000};
c->pshufd(dest, x86::dqword_ptr(*ls, off), s_masks[(enda - end) / 4]);
c->pshufd(dest, x86::dqword_ptr(*ls, j - ls_off), s_pshufd_imm[cmask]);
}
else
{
c->movaps(dest, x86::dqword_ptr(*ls, off));
c->movaps(dest, x86::dqword_ptr(*ls, j - ls_off));
}
// Perform bitwise comparison and accumulate
c->xorps(dest, x86::dqword_ptr(x86::rax, off));
c->xorps(dest, x86::dqword_ptr(x86::rax, code_off));
if (j != starta && j != starta + 16)
{
c->orps(reg0, dest);
}
code_off += 16;
}
for (u32 i = starta; i < enda; i += 4)
{
// Fill alignment holes with first or last elements
words.push_back(func[(i < m_pos ? 0 : i >= end ? end - 4 - m_pos : i - m_pos) / 4 + 1]);
}
if (sizea != 1)
if (order1)
{
c->orps(x86::xmm0, x86::xmm3);
}
@@ -571,28 +672,6 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
c->jne(label_diff);
}
}
else
{
// Legacy (slow, disabled)
save_rcx();
c->mov(x86::r9, x86::rdi);
c->mov(x86::r10, x86::rsi);
c->lea(x86::rsi, x86::qword_ptr(*ls, m_pos));
c->lea(x86::rdi, x86::qword_ptr(label_code));
c->mov(x86::ecx, (func.size() - 1) / 2);
if ((func.size() - 1) % 2)
c->cmpsd();
c->repe().cmpsq();
load_rcx();
c->mov(x86::rdi, x86::r9);
c->mov(x86::rsi, x86::r10);
c->jnz(label_diff);
for (u32 i = 1; i < func.size(); i++)
{
words.push_back(func[i]);
}
}
if (utils::has_avx())
{
@@ -603,11 +682,13 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
for (u32 i = 1; i < func.size(); i++)
{
const u32 pos = start + (i - 1) * 4;
if (g_cfg.core.spu_debug)
{
// Disasm
dis_asm.dump_pc = m_pos;
dis_asm.disasm(m_pos);
dis_asm.dump_pc = pos;
dis_asm.disasm(pos);
compiler.comment(dis_asm.last_opcode.c_str());
log += dis_asm.last_opcode;
log += '\n';
@@ -616,6 +697,22 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
// Get opcode
const u32 op = se_storage<u32>::swap(func[i]);
if (!op)
{
// Ignore hole
if (m_pos != -1)
{
LOG_ERROR(SPU, "Unexpected fallthrough to 0x%x", pos);
branch_fixed(spu_branch_target(pos));
m_pos = -1;
}
continue;
}
// Update position
m_pos = pos;
// Execute recompiler function
(this->*s_spu_decoder.decode(op))({op});
@@ -624,15 +721,6 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
{
vec[i] = vec_vars[i];
}
// Check if block was terminated
if (m_pos == -1)
{
break;
}
// Set next position
m_pos += 4;
}
if (g_cfg.core.spu_debug)
@@ -643,7 +731,7 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
// Make fallthrough if necessary
if (m_pos != -1)
{
branch_fixed(spu_branch_target(m_pos));
branch_fixed(spu_branch_target(end));
}
// Simply return
@@ -689,8 +777,8 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
std::vector<u32> addrv{func[0]};
const auto beg = m_spurt->m_map.lower_bound(addrv);
addrv[0] += 4;
const auto end = m_spurt->m_map.lower_bound(addrv);
const u32 size0 = std::distance(beg, end);
const auto _end = m_spurt->m_map.lower_bound(addrv);
const u32 size0 = std::distance(beg, _end);
if (size0 == 1)
{
@@ -727,7 +815,7 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
workload.back().size = size0;
workload.back().level = 1;
workload.back().beg = beg;
workload.back().end = end;
workload.back().end = _end;
for (std::size_t i = 0; i < workload.size(); i++)
{
@@ -746,8 +834,17 @@ spu_function_t spu_recompiler::compile(const std::vector<u32>& func)
it = it2;
size1 = w.size - size2;
const u32 x1 = w.beg->first.at(w.level);
if (!x1)
{
// Cannot split: some functions contain holes at this level
w.level++;
continue;
}
// Adjust ranges (forward)
while (it != w.end && w.beg->first.at(w.level) == it->first.at(w.level))
while (it != w.end && x1 == it->first.at(w.level))
{
it++;
size1++;