From 0a7df9d02e6eed08b91bd3eb6ec7e592619c7b1b Mon Sep 17 00:00:00 2001 From: Malcolm Jestadt Date: Mon, 12 Apr 2021 02:26:48 -0400 Subject: [PATCH] SPU LLVM: add AVX-512 SPU verification - This is hidden behind a new setting, as some cpus may downclock aggressively when executing 512 wide instructions --- rpcs3/Emu/CPU/CPUTranslator.cpp | 1 + rpcs3/Emu/CPU/CPUTranslator.h | 3 + rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp | 4 +- rpcs3/Emu/Cell/SPURecompiler.cpp | 76 +++++++++++++++++++++----- rpcs3/Emu/system_config.h | 1 + rpcs3/rpcs3qt/emu_settings_type.h | 2 + rpcs3/rpcs3qt/settings_dialog.cpp | 4 ++ rpcs3/rpcs3qt/settings_dialog.ui | 7 +++ rpcs3/rpcs3qt/tooltips.h | 1 + 9 files changed, 84 insertions(+), 15 deletions(-) diff --git a/rpcs3/Emu/CPU/CPUTranslator.cpp b/rpcs3/Emu/CPU/CPUTranslator.cpp index 6b1ca7817c..6ad1275d05 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.cpp +++ b/rpcs3/Emu/CPU/CPUTranslator.cpp @@ -59,6 +59,7 @@ void cpu_translator::initialize(llvm::LLVMContext& context, llvm::ExecutionEngin cpu == "tigerlake") { m_use_fma = true; + m_use_avx512 = true; } // Test AVX-512_icelake features (TODO) diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 5ebb38569c..669e1d8238 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -2422,6 +2422,9 @@ protected: // Allow FMA bool m_use_fma = false; + // Allow skylake-x tier AVX-512 + bool m_use_avx512 = false; + // Allow Icelake tier AVX-512 bool m_use_avx512_icl = false; diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index e9b361aa92..6be6828d30 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -281,9 +281,9 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) c->vzeroupper(); } } - else if (utils::has_avx512() && false) + else if (utils::has_avx512() && g_cfg.core.full_width_avx512) { - // AVX-512 optimized check using 512-bit registers (disabled) + // 
AVX-512 optimized check using 512-bit registers words_align = 64; const u32 starta = start & -64; diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp index 15dae29e83..0fd6584168 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.cpp +++ b/rpcs3/Emu/Cell/SPURecompiler.cpp @@ -4366,25 +4366,48 @@ public: } } + u32 stride; + u32 elements; + u32 dwords; + + if (m_use_avx512 && g_cfg.core.full_width_avx512) + { + stride = 64; + elements = 16; + dwords = 8; + } + else if (true) + { + stride = 32; + elements = 8; + dwords = 4; + } + else // TODO: Use this path when the cpu doesn't support AVX + { + stride = 16; + elements = 4; + dwords = 2; + } + // Get actual pc corresponding to the found beginning of the data llvm::Value* starta_pc = m_ir->CreateAnd(get_pc(starta), 0x3fffc); llvm::Value* data_addr = m_ir->CreateGEP(m_lsptr, starta_pc); llvm::Value* acc = nullptr; - for (u32 j = starta; j < end; j += 32) + for (u32 j = starta; j < end; j += stride) { - int indices[8]; + int indices[16]; bool holes = false; bool data = false; - for (u32 i = 0; i < 8; i++) + for (u32 i = 0; i < elements; i++) { const u32 k = j + i * 4; if (k < start || k >= end || !func.data[(k - start) / 4]) { - indices[i] = 8; + indices[i] = elements; holes = true; } else @@ -4400,35 +4423,62 @@ public: continue; } + llvm::Value* vls = nullptr; + // Load unaligned code block from LS - llvm::Value* vls = m_ir->CreateAlignedLoad(_ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + if (m_use_avx512 && g_cfg.core.full_width_avx512) + { + vls = m_ir->CreateAlignedLoad(_ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else if (true) + { + vls = m_ir->CreateAlignedLoad(_ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } + else + { + vls = m_ir->CreateAlignedLoad(_ptr(data_addr, j - starta), llvm::MaybeAlign{4}); + } // Mask if necessary if (holes) { - vls = m_ir->CreateShuffleVector(vls, ConstantAggregateZero::get(vls->getType()), indices); + vls = m_ir->CreateShuffleVector(vls, 
ConstantAggregateZero::get(vls->getType()), llvm::makeArrayRef(indices, elements)); } // Perform bitwise comparison and accumulate - u32 words[8]; + u32 words[16]; - for (u32 i = 0; i < 8; i++) + for (u32 i = 0; i < elements; i++) { const u32 k = j + i * 4; words[i] = k >= start && k < end ? func.data[(k - start) / 4] : 0; } - vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, words)); + vls = m_ir->CreateXor(vls, ConstantDataVector::get(m_context, llvm::makeArrayRef(words, elements))); acc = acc ? m_ir->CreateOr(acc, vls) : vls; check_iterations++; } // Pattern for PTEST - acc = m_ir->CreateBitCast(acc, get_type()); + if (m_use_avx512 && g_cfg.core.full_width_avx512) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else if (true) + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + else + { + acc = m_ir->CreateBitCast(acc, get_type()); + } + llvm::Value* elem = m_ir->CreateExtractElement(acc, u64{0}); - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 1)); - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 2)); - elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, 3)); + + for (u32 i = 1; i < dwords; i++) + { + elem = m_ir->CreateOr(elem, m_ir->CreateExtractElement(acc, i)); + } // Compare result with zero const auto cond = m_ir->CreateICmpNE(elem, m_ir->getInt64(0)); diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h index 407bcf3e64..b37fe31d93 100644 --- a/rpcs3/Emu/system_config.h +++ b/rpcs3/Emu/system_config.h @@ -54,6 +54,7 @@ struct cfg_root : cfg::node cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length cfg::_bool llvm_ppu_accurate_vector_nan{ this, "PPU LLVM Accurate Vector NaN values", false }; cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify 
relative instructions to skip) + cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false}; cfg::_bool debug_console_mode{ this, "Debug Console Mode", false }; // Debug console emulation, not recommended cfg::_bool hook_functions{ this, "Hook static functions" }; diff --git a/rpcs3/rpcs3qt/emu_settings_type.h b/rpcs3/rpcs3qt/emu_settings_type.h index 001097daa0..b1b28af7e1 100644 --- a/rpcs3/rpcs3qt/emu_settings_type.h +++ b/rpcs3/rpcs3qt/emu_settings_type.h @@ -38,6 +38,7 @@ enum class emu_settings_type SleepTimersAccuracy, ClocksScale, PerformanceReport, + FullWidthAVX512, // Graphics Renderer, @@ -189,6 +190,7 @@ inline static const QMap settings_location = { emu_settings_type::ClocksScale, { "Core", "Clocks scale"}}, { emu_settings_type::AccuratePPU128Loop, { "Core", "Accurate PPU 128-byte Reservation Op Max Length"}}, { emu_settings_type::PerformanceReport, { "Core", "Enable Performance Report"}}, + { emu_settings_type::FullWidthAVX512, { "Core", "Full Width AVX-512"}}, // Graphics Tab { emu_settings_type::Renderer, { "Video", "Renderer"}}, diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp index ada0430411..1b708bcea6 100644 --- a/rpcs3/rpcs3qt/settings_dialog.cpp +++ b/rpcs3/rpcs3qt/settings_dialog.cpp @@ -202,6 +202,10 @@ settings_dialog::settings_dialog(std::shared_ptr gui_settings, std m_emu_settings->EnhanceCheckBox(ui->accurateXFloat, emu_settings_type::AccurateXFloat); SubscribeTooltip(ui->accurateXFloat, tooltips.settings.accurate_xfloat); + m_emu_settings->EnhanceCheckBox(ui->fullWidthAVX512, emu_settings_type::FullWidthAVX512); + SubscribeTooltip(ui->fullWidthAVX512, tooltips.settings.full_width_avx512); + ui->fullWidthAVX512->setEnabled(utils::has_avx512()); + // Comboboxes m_emu_settings->EnhanceComboBox(ui->spuBlockSize, emu_settings_type::SPUBlockSize); diff --git a/rpcs3/rpcs3qt/settings_dialog.ui b/rpcs3/rpcs3qt/settings_dialog.ui index 2d78c510c2..a7eeb7c233 100644 --- a/rpcs3/rpcs3qt/settings_dialog.ui 
+++ b/rpcs3/rpcs3qt/settings_dialog.ui @@ -171,6 +171,13 @@ + + + + Full Width AVX-512 + + + diff --git a/rpcs3/rpcs3qt/tooltips.h b/rpcs3/rpcs3qt/tooltips.h index 80ebb7ddbf..c5fdb37e79 100644 --- a/rpcs3/rpcs3qt/tooltips.h +++ b/rpcs3/rpcs3qt/tooltips.h @@ -71,6 +71,7 @@ public: const QString enable_tsx = tr("Enable usage of TSX instructions.\nNeeds to be forced on some Haswell or Broadwell CPUs.\nForcing this on older Hardware can lead to system instability, use it with caution."); const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility."); const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value."); + const QString full_width_avx512 = tr("Enables the use of code with full width AVX-512.\nThis code can be executed much faster, but may cause a loss in performance if your CPU model experiences downclocking on wide AVX-512 loads.\nNote that AVX-512 instructions will be used regardless of this option, just at 128 and 256 bit width."); // debug