SPU: Inline and batch MFC list transfers

Author: Eladash, 2022-10-05 18:30:12 +03:00 (committed by Ivan)
parent d6d7ade6e3
commit 6a36967e6f
4 changed files with 535 additions and 67 deletions


@@ -2688,79 +2688,540 @@ bool spu_thread::do_dma_check(const spu_mfc_cmd& args)
bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
perf_meter<"MFC_LIST"_u64> perf0;
// Number of elements to fetch in one go
constexpr u32 fetch_size = 6;
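// Six 8-byte elements are 48 bytes, i.e. exactly the three 16-byte vector loads performed below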
struct alignas(8) list_element
{
- be_t<u16> sb; // Stall-and-Notify bit (0x8000)
+ u8 sb; // Stall-and-Notify bit (0x80)
+ u8 pad;
be_t<u16> ts; // List Transfer Size
be_t<u32> ea; // External Address Low
};
- union
- {
- list_element items[fetch_size];
- alignas(v128) char bufitems[sizeof(items)];
- };
+ alignas(16) list_element items[fetch_size];
static_assert(sizeof(v128) % sizeof(list_element) == 0);
spu_mfc_cmd transfer;
transfer.eah = 0;
transfer.tag = args.tag;
- transfer.cmd = MFC(args.cmd & ~MFC_LIST_MASK);
- args.lsa &= 0x3fff0;
- args.eal &= 0x3fff8;
+ transfer.cmd = MFC{static_cast<u8>(args.cmd & ~0xf)};
u32 index = fetch_size;
auto item_ptr = _ptr<const list_element>(args.eal & 0x3fff8);
u32 arg_lsa = args.lsa & 0x3fff0;
u32 arg_size = args.size;
u8 optimization_compatible = transfer.cmd & (MFC_GET_CMD | MFC_PUT_CMD);
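// Holds MFC_GET_CMD or MFC_PUT_CMD when the matching inline fast path may be used; the checks below clear these bits when debug or accuracy settings require the generic DMA path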
if (spu_log.trace || g_cfg.core.spu_accurate_dma || g_cfg.core.mfc_debug)
{
optimization_compatible = 0;
}
else if (optimization_compatible == MFC_PUT_CMD && (g_cfg.video.strict_rendering_mode || g_cfg.core.rsx_fifo_accuracy))
{
optimization_compatible &= ~MFC_PUT_CMD;
}
constexpr u32 ts_mask = 0x7fff;
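// List transfer sizes go up to 16 KB, so 15 bits suffice; the top bit of the halfword is not part of the size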
// Assume called with size greater than 0
while (true)
{
// Check if fetching is needed
if (index == fetch_size)
{
const v128 data0 = v128::loadu(item_ptr, 0);
const v128 data1 = v128::loadu(item_ptr, 1);
const v128 data2 = v128::loadu(item_ptr, 2);
// Ideally this would not be needed until after the if below, but relying on the compiler to keep the elements in SSE registers through it all is unrealistic
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 0], &data0, sizeof(v128));
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 1], &data1, sizeof(v128));
std::memcpy(&items[sizeof(v128) / sizeof(list_element) * 2], &data2, sizeof(v128));
u32 s_size = data0._u32[0];
// We still need to verify matching between odd and even elements (the vector test below is position-independent: it only compares even elements with even and odd with odd)
// The even/odd pair compared here is the least likely to match by accident, so in most cases it skips the entire check very efficiently
// Assumes the padding bits also match
if (optimization_compatible == MFC_GET_CMD && s_size == data0._u32[2] && arg_size >= fetch_size * 8)
{
const v128 ored = (data0 | data1 | data2) & v128::from64p(std::bit_cast<be_t<u64>>(1ull << 63 | (u64{ts_mask} << 32) | 0xe000'0000));
const v128 anded = (data0 & data1 & data2) & v128::from64p(std::bit_cast<be_t<u64>>(0xe000'0000 | (u64{ts_mask} << 32)));
// Tests:
// 1. Unset stall-and-notify bit on all 6 elements
// 2. Equality of transfer size across all 6 elements
// 3. All EAs lie in the same 512MB region: this is rarely violated, and it ensures MMIO is not involved in any of the transfers (MMIO is assumed rare enough that this is the last check)
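// OR and AND of the masked lanes are equal only if every masked bit is identical across the compared elements, and the stall bit (masked only in the OR) is clear in all of them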
if (ored == anded && items[0].ea < RAW_SPU_BASE_ADDR && items[1].ea < RAW_SPU_BASE_ADDR)
{
// Execute the postponed byteswapping and masking
s_size = std::bit_cast<be_t<u32>>(s_size) & ts_mask;
u8* src = vm::_ptr<u8>(0);
u8* dst = this->ls + arg_lsa;
// Assume success, prepare the next elements
arg_lsa += fetch_size * utils::align<u32>(s_size, 16);
item_ptr += fetch_size;
arg_size -= fetch_size * 8;
// Type which is friendly for fused address calculations
constexpr usz _128 = 128;
// This whole path relies on many constraints being met (violating them crashes the real MFC); as a minor optimization, EA is assumed 16-byte aligned for transfers of 16 bytes or more
#define MOV_T(type, index, _ea) { const usz ea = _ea; *reinterpret_cast<type*>(dst + index * utils::align<u32>(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast<const type*>(src + ea); } void()
#define MOV_128(index, ea) mov_rdata(*reinterpret_cast<decltype(rdata)*>(dst + index * _128), *reinterpret_cast<const decltype(rdata)*>(src + (ea)))
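// MOV_T copies one element per list entry into LS slots spaced utils::align(sizeof(type), 16) bytes apart, adding ea % 16 for sub-16-byte types to mirror MFC LS addressing
// MOV_128 copies one 128-byte block per slot using mov_rdata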
switch (s_size)
{
case 0:
{
if (!arg_size)
{
return true;
}
continue;
}
case 1:
{
MOV_T(u8, 0, items[0].ea);
MOV_T(u8, 1, items[1].ea);
MOV_T(u8, 2, items[2].ea);
MOV_T(u8, 3, items[3].ea);
MOV_T(u8, 4, items[4].ea);
MOV_T(u8, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 2:
{
MOV_T(u16, 0, items[0].ea);
MOV_T(u16, 1, items[1].ea);
MOV_T(u16, 2, items[2].ea);
MOV_T(u16, 3, items[3].ea);
MOV_T(u16, 4, items[4].ea);
MOV_T(u16, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 4:
{
MOV_T(u32, 0, items[0].ea);
MOV_T(u32, 1, items[1].ea);
MOV_T(u32, 2, items[2].ea);
MOV_T(u32, 3, items[3].ea);
MOV_T(u32, 4, items[4].ea);
MOV_T(u32, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 8:
{
MOV_T(u64, 0, items[0].ea);
MOV_T(u64, 1, items[1].ea);
MOV_T(u64, 2, items[2].ea);
MOV_T(u64, 3, items[3].ea);
MOV_T(u64, 4, items[4].ea);
MOV_T(u64, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 16:
{
MOV_T(v128, 0, items[0].ea);
MOV_T(v128, 1, items[1].ea);
MOV_T(v128, 2, items[2].ea);
MOV_T(v128, 3, items[3].ea);
MOV_T(v128, 4, items[4].ea);
MOV_T(v128, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 32:
{
struct mem
{
v128 a[2];
};
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 48:
{
struct mem
{
v128 a[3];
};
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 64:
{
struct mem
{
v128 a[4];
};
// TODO: Optimize (four separate 16-byte copies are inefficient)
MOV_T(mem, 0, items[0].ea);
MOV_T(mem, 1, items[1].ea);
MOV_T(mem, 2, items[2].ea);
MOV_T(mem, 3, items[3].ea);
MOV_T(mem, 4, items[4].ea);
MOV_T(mem, 5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 128:
{
MOV_128(0, items[0].ea);
MOV_128(1, items[1].ea);
MOV_128(2, items[2].ea);
MOV_128(3, items[3].ea);
MOV_128(4, items[4].ea);
MOV_128(5, items[5].ea);
if (!arg_size)
{
return true;
}
continue;
}
case 256:
{
const usz ea0 = items[0].ea;
MOV_128(0, ea0 + 0);
MOV_128(1, ea0 + _128);
const usz ea1 = items[1].ea;
MOV_128(2, ea1 + 0);
MOV_128(3, ea1 + _128);
const usz ea2 = items[2].ea;
MOV_128(4, ea2 + 0);
MOV_128(5, ea2 + _128);
const usz ea3 = items[3].ea;
MOV_128(6, ea3 + 0);
MOV_128(7, ea3 + _128);
const usz ea4 = items[4].ea;
MOV_128(8, ea4 + 0);
MOV_128(9, ea4 + _128);
const usz ea5 = items[5].ea;
MOV_128(10, ea5 + 0);
MOV_128(11, ea5 + _128);
if (!arg_size)
{
return true;
}
continue;
}
case 512:
{
const usz ea0 = items[0].ea;
MOV_128(0 , ea0 + _128 * 0);
MOV_128(1 , ea0 + _128 * 1);
MOV_128(2 , ea0 + _128 * 2);
MOV_128(3 , ea0 + _128 * 3);
const usz ea1 = items[1].ea;
MOV_128(4 , ea1 + _128 * 0);
MOV_128(5 , ea1 + _128 * 1);
MOV_128(6 , ea1 + _128 * 2);
MOV_128(7 , ea1 + _128 * 3);
const usz ea2 = items[2].ea;
MOV_128(8 , ea2 + _128 * 0);
MOV_128(9 , ea2 + _128 * 1);
MOV_128(10, ea2 + _128 * 2);
MOV_128(11, ea2 + _128 * 3);
const usz ea3 = items[3].ea;
MOV_128(12, ea3 + _128 * 0);
MOV_128(13, ea3 + _128 * 1);
MOV_128(14, ea3 + _128 * 2);
MOV_128(15, ea3 + _128 * 3);
const usz ea4 = items[4].ea;
MOV_128(16, ea4 + _128 * 0);
MOV_128(17, ea4 + _128 * 1);
MOV_128(18, ea4 + _128 * 2);
MOV_128(19, ea4 + _128 * 3);
const usz ea5 = items[5].ea;
MOV_128(20, ea5 + _128 * 0);
MOV_128(21, ea5 + _128 * 1);
MOV_128(22, ea5 + _128 * 2);
MOV_128(23, ea5 + _128 * 3);
if (!arg_size)
{
return true;
}
continue;
}
default:
{
// TODO: Are more cases common enough? (only sizes below 512 bytes are worth considering; above that the benefit of this optimization is doubtful)
break;
}
}
#undef MOV_T
#undef MOV_128
// Optimization miss, revert changes
arg_lsa -= fetch_size * utils::align<u32>(s_size, 16);
item_ptr -= fetch_size;
arg_size += fetch_size * 8;
}
}
// Reset to elements array head
index = 0;
- const auto src = _ptr<const void>(args.eal);
- const v128 data0 = v128::loadu(src, 0);
- const v128 data1 = v128::loadu(src, 1);
- const v128 data2 = v128::loadu(src, 2);
- reinterpret_cast<v128*>(bufitems)[0] = data0;
- reinterpret_cast<v128*>(bufitems)[1] = data1;
- reinterpret_cast<v128*>(bufitems)[2] = data2;
}
- const u32 size = items[index].ts & 0x7fff;
+ const u32 size = items[index].ts & ts_mask;
const u32 addr = items[index].ea;
spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast<be_t<u64>>(items[index]), args.lsa | (addr & 0xf));
if (size)
auto check_carry_16 = [](u16 addr, u16 size)
{
#ifdef _MSC_VER
u16 out;
return _addcarry_u16(0, addr, size - 1, &out);
#else
return ((addr + size - 1) >> 16) != 0;
#endif
};
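// True when the low 16 bits of addr plus size carry out, i.e. the transfer crosses a 64KB boundary; such PUTs are not inlined below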
// Try to inline the transfer
if (addr < RAW_SPU_BASE_ADDR && size && optimization_compatible == MFC_GET_CMD)
{
const u8* src = vm::_ptr<u8>(addr);
u8* dst = this->ls + arg_lsa + (addr & 0xf);
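// MFC list transfers are 1, 2, 4, 8 bytes or a multiple of 16 bytes (up to 16 KB); the fixed-size cases handle the former, the default case the latter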
switch (u32 _size = size)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
if (_size > s_rep_movsb_threshold)
{
__movsb(dst, src, _size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
while (_size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
_size -= 128;
}
while (_size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
}
break;
}
}
arg_lsa += utils::align<u32>(size, 16);
}
// Avoid inlining huge transfers because the inline path intentionally defers releasing the range lock
else if (addr < RAW_SPU_BASE_ADDR && size - 1 <= 0x400 - 1 && optimization_compatible == MFC_PUT_CMD && !check_carry_16(static_cast<u16>(addr), static_cast<u16>(size)))
{
if (!g_use_rtm)
{
vm::range_lock(range_lock, addr & -128, utils::align<u32>(addr + size, 128) - (addr & -128));
}
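// The range lock stays held across inlined PUT elements; it is released before any non-inlined transfer, on stall-and-notify, and at the end of the function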
u8* dst = vm::_ptr<u8>(addr);
const u8* src = this->ls + arg_lsa + (addr & 0xf);
switch (u32 _size = size)
{
case 1:
{
*reinterpret_cast<u8*>(dst) = *reinterpret_cast<const u8*>(src);
break;
}
case 2:
{
*reinterpret_cast<u16*>(dst) = *reinterpret_cast<const u16*>(src);
break;
}
case 4:
{
*reinterpret_cast<u32*>(dst) = *reinterpret_cast<const u32*>(src);
break;
}
case 8:
{
*reinterpret_cast<u64*>(dst) = *reinterpret_cast<const u64*>(src);
break;
}
default:
{
if (_size > s_rep_movsb_threshold)
{
__movsb(dst, src, _size);
}
else
{
// Avoid unaligned stores in mov_rdata_avx
if (reinterpret_cast<u64>(dst) & 0x10)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
while (_size >= 128)
{
mov_rdata(*reinterpret_cast<spu_rdata_t*>(dst), *reinterpret_cast<const spu_rdata_t*>(src));
dst += 128;
src += 128;
_size -= 128;
}
while (_size)
{
*reinterpret_cast<v128*>(dst) = *reinterpret_cast<const v128*>(src);
dst += 16;
src += 16;
_size -= 16;
}
}
break;
}
}
arg_lsa += utils::align<u32>(size, 16);
}
else if (size)
{
range_lock->release(0);
spu_log.trace("LIST: item=0x%016x, lsa=0x%05x", std::bit_cast<be_t<u64>>(items[index]), arg_lsa | (addr & 0xf));
transfer.eal = addr;
- transfer.lsa = args.lsa | (addr & 0xf);
+ transfer.lsa = arg_lsa | (addr & 0xf);
transfer.size = size;
arg_lsa += utils::align<u32>(size, 16);
do_dma_transfer(this, transfer, ls);
- const u32 add_size = std::max<u32>(size, 16);
- args.lsa += add_size;
}
- args.size -= 8;
+ arg_size -= 8;
- if (!args.size)
+ if (!arg_size)
{
// No more elements
break;
}
- args.eal += 8;
+ item_ptr++;
- if (items[index].sb & 0x8000) [[unlikely]]
+ if (items[index].sb & 0x80) [[unlikely]]
{
range_lock->release(0);
ch_stall_mask |= utils::rol32(1, args.tag);
if (!ch_stall_stat.get_count())
@@ -2771,12 +3232,16 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value());
args.tag |= 0x80; // Set stalled status
args.eal = reinterpret_cast<const u8*>(item_ptr) - this->ls;
args.lsa = arg_lsa;
args.size = arg_size;
return false;
}
index++;
}
range_lock->release(0);
return true;
}
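As an aside on the batching check above: OR and AND over a set of values coincide on a bit position exactly when all the values agree there; the batch check applies this per 64-bit lane across the three loaded vectors. Below is a minimal self-contained sketch of that property (helper name and test values are made up for illustration, plain uint64_t instead of v128):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// OR collects bits set in any value, AND keeps bits set in every value;
// they match on the masked bits only if all values are identical there.
static bool all_equal_masked(const std::uint64_t* vals, std::size_t n, std::uint64_t mask)
{
	std::uint64_t ored = 0;
	std::uint64_t anded = mask;

	for (std::size_t i = 0; i < n; i++)
	{
		ored |= vals[i] & mask;
		anded &= vals[i] & mask;
	}

	return ored == anded;
}

int main()
{
	// Transfer-size field packed like the list elements above: 15 bits starting at bit 32
	constexpr std::uint64_t ts_mask = 0x7fffull << 32;

	const std::uint64_t same[3]  = { 0x0080'0000'0000ull, 0x0080'0000'1000ull, 0x0080'0000'2000ull };
	const std::uint64_t mixed[3] = { 0x0080'0000'0000ull, 0x0040'0000'0000ull, 0x0080'0000'0000ull };

	std::printf("%d %d\n", all_equal_masked(same, 3, ts_mask), all_equal_masked(mixed, 3, ts_mask)); // prints: 1 0
}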


@@ -159,19 +159,17 @@ namespace vm
void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size)
{
- perf_meter<"RHW_LOCK"_u64> perf0;
+ perf_meter<"RHW_LOCK"_u64> perf0(0);
- auto _cpu = get_current_cpu_thread();
+ cpu_thread* _cpu = nullptr;
- if (_cpu)
+ if (u64 to_store = begin | (u64{size} << 32); *range_lock != to_store)
{
- _cpu->state += cpu_flag::wait + cpu_flag::temp;
+ range_lock->store(to_store);
}
for (u64 i = 0;; i++)
{
- range_lock->store(begin | (u64{size} << 32));
const u64 lock_val = g_range_lock.load();
const u64 is_share = g_shmem[begin >> 16].load();
@@ -215,6 +213,11 @@ namespace vm
{
range_lock->release(0);
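// Start the RHW_LOCK timer only once contention is actually encountered, so uncontended calls don't pollute the stats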
if (!perf0)
{
perf0.restart();
}
// Try triggering a page fault (write)
// TODO: Read memory if needed
vm::_ref<atomic_t<u8>>(test / 4096 == begin / 4096 ? begin : test) += 0;
@@ -223,8 +226,26 @@ namespace vm
}
// Wait a bit before accessing global lock
- range_lock->store(0);
+ range_lock->release(0);
if (!perf0)
{
perf0.restart();
}
busy_wait(200);
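// Fetch the current thread and flag it as waiting only after a couple of failed iterations, keeping the initial spins cheap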
if (i >= 2 && !_cpu)
{
_cpu = cpu_thread::get_current();
if (_cpu)
{
_cpu->state += cpu_flag::wait + cpu_flag::temp;
}
}
range_lock->store(begin | (u64{size} << 32));
}
if (_cpu)


@@ -57,43 +57,15 @@ namespace vm
return;
}
- const u64 lock_val = g_range_lock.load();
- const u64 is_share = g_shmem[begin >> 16].load();
- #ifndef _MSC_VER
- __asm__(""); // Tiny barrier
- #endif
- u64 lock_addr = static_cast<u32>(lock_val); // -> u64
- u32 lock_size = static_cast<u32>(lock_val << range_bits >> (32 + range_bits));
- u64 addr = begin;
// Optimization: if range_locked is not used, the addr check will always pass
// Otherwise, g_shmem is unchanged and its value is reliable to read
- if ((lock_val >> range_pos) == (range_locked >> range_pos))
+ if (!g_range_lock)
{
- lock_size = 128;
- if (is_share) [[unlikely]]
- {
- addr = static_cast<u16>(begin) | is_share;
- lock_addr = lock_val;
- }
+ return;
}
- if (addr + size <= lock_addr || addr >= lock_addr + lock_size) [[likely]]
- {
- const u64 new_lock_val = g_range_lock.load();
- if (!new_lock_val || new_lock_val == lock_val) [[likely]]
- {
- return;
- }
- }
range_lock->release(0);
// Fallback to slow path
range_lock_internal(range_lock, begin, size);
}


@@ -93,6 +93,16 @@ public:
restart();
}
FORCE_INLINE SAFE_BUFFERS() perf_meter(int) noexcept
{
std::fill(std::begin(m_timestamps), std::end(m_timestamps), 0);
}
FORCE_INLINE SAFE_BUFFERS(operator bool) () const noexcept
{
return m_timestamps[0] != 0;
}
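// Constructed with the int tag, the meter starts with zeroed timestamps (not yet running); operator bool reports whether restart() has been called, letting callers such as range_lock_internal begin timing only on contention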
// Copy first timestamp
template <auto SN, auto... S>
FORCE_INLINE SAFE_BUFFERS() perf_meter(const perf_meter<SN, S...>& r) noexcept