SPU: Copy with memcpy() instead of hand-rolled SSE2

In a very unscientific benchmark, spu_thread::do_dma_transfer() went from 2.27% of my CPU time to 0.07%, while __memmove_avx_unaligned_erms() went from 1.47% to 2.88%. Combined, that is 3.74% before versus 2.95% after, a net saving of about 0.8%.
2025-02-07 03:40:07 +00:00 · 2019-11-20 16:53:22 +01:00 · 2019-11-20 16:53:22 +01:00 · 425e032a62
commit 425e032a62
parent 5261886449
1 changed file with 8 additions and 0 deletions
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@ -1433,6 +1433,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)

 			auto lock = vm::passive_lock(eal & -128, ::align(eal + size, 128));

+#ifdef __GNUG__
+			std::memcpy(dst, src, size);
+#else
 			while (size >= 128)
 			{
 				mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@ -1450,6 +1453,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 				src += 16;
 				size -= 16;
 			}
+#endif

 			lock->release(0);
 			break;
@ -1483,6 +1487,9 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 	}
 	default:
 	{
+#ifdef __GNUG__
+		std::memcpy(dst, src, size);
+#else
 		while (size >= 128)
 		{
 			mov_rdata(*reinterpret_cast<decltype(spu_thread::rdata)*>(dst), *reinterpret_cast<const decltype(spu_thread::rdata)*>(src));
@ -1500,6 +1507,7 @@ void spu_thread::do_dma_transfer(const spu_mfc_cmd& args)
 			src += 16;
 			size -= 16;
 		}
+#endif

 		break;
 	}