From 71f1021648df008715e00c2021cc880d3a583fef Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 21 Nov 2020 07:56:54 +0300 Subject: [PATCH] Fix thread pool entry point and get_cycles() Fix possible race between thread handle availability. Don't treat zero thread as invalid one. Now entry point is full is assembly. Attempt to fix #9282 Also fix some TLS. --- Utilities/Thread.cpp | 160 ++++++++++++++++++-------------- Utilities/Thread.h | 47 +++------- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 2 + rpcs3/Emu/RSX/GL/GLHelpers.cpp | 8 +- rpcs3/Emu/RSX/GL/GLHelpers.h | 2 +- 5 files changed, 109 insertions(+), 110 deletions(-) diff --git a/Utilities/Thread.cpp b/Utilities/Thread.cpp index 2b6c08b9e9..20ab85c8d4 100644 --- a/Utilities/Thread.cpp +++ b/Utilities/Thread.cpp @@ -88,6 +88,7 @@ thread_local u64 g_tls_fault_rsx = 0; thread_local u64 g_tls_fault_spu = 0; thread_local u64 g_tls_wait_time = 0; thread_local u64 g_tls_wait_fail = 0; +thread_local bool g_tls_access_violation_recovered = false; extern thread_local std::string(*g_tls_log_prefix)(); template <> @@ -1367,13 +1368,11 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no return true; } - thread_local bool access_violation_recovered = false; - // Hack: allocate memory in case the emulator is stopping const auto hack_alloc = [&]() { // If failed the value remains true and std::terminate should be called - access_violation_recovered = true; + g_tls_access_violation_recovered = true; const auto area = vm::reserve_map(vm::any, addr & -0x10000, 0x10000); @@ -1525,7 +1524,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no if (cpu->id_type() != 1) { - if (!access_violation_recovered) + if (!g_tls_access_violation_recovered) { vm_log.notice("\n%s", cpu->dump_all()); vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8); @@ -1556,14 +1555,14 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no Emu.Pause(); - if (cpu && !access_violation_recovered) + if (cpu && !g_tls_access_violation_recovered) { vm_log.notice("\n%s", cpu->dump_all()); } // Note: a thread may access violate more than once after hack_alloc recovery // Do not log any further access violations in this case. - if (!access_violation_recovered) + if (!g_tls_access_violation_recovered) { vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8); } @@ -1850,7 +1849,7 @@ static atomic_t s_thread_bits{0}; static atomic_t s_thread_pool[128]{}; -void thread_base::start(native_entry entry, void(*trampoline)()) +void thread_base::start(native_entry entry) { for (u128 bits = s_thread_bits.load(); bits; bits &= bits - 1) { @@ -1869,16 +1868,13 @@ void thread_base::start(native_entry entry, void(*trampoline)()) } // Send "this" and entry point - m_thread = reinterpret_cast(trampoline); + const u64 entry_val = reinterpret_cast(entry); + m_thread = entry_val; atomic_storage::release(*tls, this); s_thread_pool[pos].notify_all(); // Wait for actual "m_thread" in return - while (m_thread == reinterpret_cast(trampoline)) - { - busy_wait(300); - } - + m_thread.wait(entry_val); return; } @@ -1892,6 +1888,10 @@ void thread_base::start(native_entry entry, void(*trampoline)()) void thread_base::initialize(void (*error_cb)()) { +#ifndef _WIN32 + m_thread.release(reinterpret_cast(pthread_self())); +#endif + // Initialize TLS variables thread_ctrl::g_tls_this_thread = this; @@ -2012,9 +2012,6 @@ u64 thread_base::finalize(thread_state result_state) noexcept atomic_wait_engine::set_wait_callback(nullptr); - const u64 _self = m_thread; - m_thread.release(0); - // Return true if need to delete thread object (no) const bool ok = 0 == (3 & ~m_sync.fetch_op([&](u64& v) { @@ -2026,48 +2023,55 @@ u64 thread_base::finalize(thread_state result_state) noexcept m_sync.notify_all(2); // No detached thread supported atm - return _self; + return m_thread; } -void thread_base::finalize(u64 _self) noexcept +u64 thread_base::finalize(u64 _self) noexcept { + g_tls_fault_all = 0; + g_tls_fault_rsx = 0; + g_tls_fault_spu = 0; + g_tls_wait_time = 0; + g_tls_wait_fail = 0; + g_tls_access_violation_recovered = false; + atomic_wait_engine::set_wait_callback(nullptr); g_tls_log_prefix = []() -> std::string { return {}; }; thread_ctrl::g_tls_this_thread = nullptr; if (!_self) { - return; + return 0; } // Try to add self to thread pool - const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits) - { - if (~bits) [[likely]] - { - // Set lowest clear bit - bits |= bits + 1; - return true; - } - - return false; - }); - - if (!ok) - { -#ifdef _WIN32 - _endthread(); -#else - pthread_detach(reinterpret_cast(_self)); - pthread_exit(0); -#endif - return; - } - set_name("..pool"); - // Obtain id from atomic op - const u32 pos = utils::ctz128(~bits); + u32 pos = -1; + + while (true) + { + const auto [bits, ok] = s_thread_bits.fetch_op([](u128& bits) + { + if (~bits) [[likely]] + { + // Set lowest clear bit + bits |= bits + 1; + return true; + } + + return false; + }); + + if (ok) [[likely]] + { + pos = utils::ctz128(~bits); + break; + } + + s_thread_bits.wait(bits); + } + const auto tls = &thread_ctrl::g_tls_this_thread; s_thread_pool[pos] = tls; @@ -2082,30 +2086,47 @@ void thread_base::finalize(u64 _self) noexcept val &= ~(u128(1) << pos); }); + s_thread_bits.notify_one(); + // Restore thread id const auto _this = atomic_storage::load(*tls); const auto entry = _this->m_thread.exchange(_self); + verify(HERE), entry != _self; _this->m_thread.notify_one(); - // Hack return address to avoid tail call -#ifdef _MSC_VER - *static_cast(_AddressOfReturnAddress()) = entry; -#else - static_cast(__builtin_frame_address(0))[1] = entry; -#endif - //reinterpret_cast(entry)(_this); + // Return new entry + return entry; } -void (*thread_base::make_trampoline(native_entry entry))() +thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base)) { - return build_function_asm([&](asmjit::X86Assembler& c, auto& args) + return build_function_asm([&](asmjit::X86Assembler& c, auto& args) { using namespace asmjit; - // Revert effect of ret instruction (fix stack alignment) - c.mov(x86::rax, imm_ptr(entry)); - c.sub(x86::rsp, 8); - c.jmp(x86::rax); + Label _ret = c.newLabel(); + c.push(x86::rbp); + c.sub(x86::rsp, 0x20); + + // Call entry point (TODO: support for detached threads missing?) + c.call(imm_ptr(entry)); + + // Call finalize, return if zero + c.mov(args[0], x86::rax); + c.call(imm_ptr(finalize)); + c.test(x86::rax, x86::rax); + c.jz(_ret); + + // Otherwise, call it as an entry point with first arg = new current thread + c.mov(x86::rbp, x86::rax); + c.call(imm_ptr(thread_ctrl::get_current)); + c.mov(args[0], x86::rax); + c.add(x86::rsp, 0x28); + c.jmp(x86::rbp); + + c.bind(_ret); + c.add(x86::rsp, 0x28); + c.ret(); }); } @@ -2193,12 +2214,13 @@ thread_base::thread_base(std::string_view name) thread_base::~thread_base() { - if (u64 handle = m_thread.exchange(0)) + // Only cleanup on errored status + if ((m_sync & 3) == 2) { #ifdef _WIN32 - CloseHandle(reinterpret_cast(handle)); + CloseHandle(reinterpret_cast(m_thread.load())); #else - pthread_detach(reinterpret_cast(handle)); + pthread_join(reinterpret_cast(m_thread.load()), nullptr); #endif } } @@ -2255,13 +2277,15 @@ u64 thread_base::get_native_id() const u64 thread_base::get_cycles() { - u64 cycles; + u64 cycles = 0; + + const u64 handle = m_thread; #ifdef _WIN32 - if (QueryThreadCycleTime(reinterpret_cast(m_thread.load()), &cycles)) + if (QueryThreadCycleTime(reinterpret_cast(handle), &cycles)) { #elif __APPLE__ - mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast(m_thread.load())); + mach_port_name_t port = pthread_mach_thread_np(reinterpret_cast(handle)); mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT; thread_basic_info_data_t info; kern_return_t ret = thread_info(port, THREAD_BASIC_INFO, reinterpret_cast(&info), &count); @@ -2272,7 +2296,7 @@ u64 thread_base::get_cycles() #else clockid_t _clock; struct timespec thread_time; - if (!pthread_getcpuclockid(reinterpret_cast(m_thread.load()), &_clock) && !clock_gettime(_clock, &thread_time)) + if (!pthread_getcpuclockid(reinterpret_cast(handle), &_clock) && !clock_gettime(_clock, &thread_time)) { cycles = static_cast(thread_time.tv_sec) * 1'000'000'000 + thread_time.tv_nsec; #endif @@ -2317,19 +2341,15 @@ void thread_ctrl::emergency_exit(std::string_view reason) if (!_self) { + // Unused, detached thread support remnant delete _this; } thread_base::finalize(0); #ifdef _WIN32 - _endthread(); + _endthreadex(0); #else - if (_self) - { - pthread_detach(reinterpret_cast(_self)); - } - pthread_exit(0); #endif } diff --git a/Utilities/Thread.h b/Utilities/Thread.h index a0df14578b..1e5d0143f6 100644 --- a/Utilities/Thread.h +++ b/Utilities/Thread.h @@ -111,22 +111,22 @@ private: stx::atomic_cptr m_tname; // Start thread - void start(native_entry, void(*)()); + void start(native_entry); // Called at the thread start void initialize(void (*error_cb)()); - // Called at the thread end, returns true if needs destruction + // Called at the thread end, returns self handle u64 finalize(thread_state result) noexcept; // Cleanup after possibly deleting the thread instance - static void finalize(u64 _self) noexcept; + static u64 finalize(u64 _self) noexcept; // Set name for debugger static void set_name(std::string); - // Make trampoline with stack fix - static void(*make_trampoline(native_entry))(); + // Make entry point + static native_entry make_trampoline(u64(*)(thread_base*)); friend class thread_ctrl; @@ -277,35 +277,12 @@ class named_thread final : public Context, result_storage_t, thread_bas using result = result_storage_t; using thread = thread_base; - // Type-erased thread entry point -#ifdef _WIN32 - static inline uint __stdcall entry_point(void* arg) -#else - static inline void* entry_point(void* arg) -#endif + static u64 entry_point(thread_base* _base) { - if (auto _this = thread_ctrl::get_current()) - { - arg = _this; - } - - const auto _this = static_cast(static_cast(arg)); - - // Perform self-cleanup if necessary - u64 _self = _this->entry_point(); - - if (!_self) - { - delete _this; - thread::finalize(0); - return 0; - } - - thread::finalize(_self); - return 0; + return static_cast(_base)->entry_point2(); } - u64 entry_point() + u64 entry_point2() { thread::initialize([]() { @@ -330,7 +307,7 @@ class named_thread final : public Context, result_storage_t, thread_bas return thread::finalize(thread_state::finished); } - static inline void(*trampoline)() = thread::make_trampoline(entry_point); + static inline thread::native_entry trampoline = thread::make_trampoline(entry_point); friend class thread_ctrl; @@ -341,7 +318,7 @@ public: : Context() , thread(Context::thread_name) { - thread::start(&named_thread::entry_point, trampoline); + thread::start(trampoline); } // Normal forwarding constructor @@ -350,7 +327,7 @@ public: : Context(std::forward(args)...) , thread(name) { - thread::start(&named_thread::entry_point, trampoline); + thread::start(trampoline); } // Lambda constructor, also the implicit deduction guide candidate @@ -358,7 +335,7 @@ public: : Context(std::forward(f)) , thread(name) { - thread::start(&named_thread::entry_point, trampoline); + thread::start(trampoline); } named_thread(const named_thread&) = delete; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index c560a8b54d..b5fbe91323 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -469,6 +469,8 @@ void GLGSRender::on_exit() GSRender::on_exit(); zcull_ctrl.release(); + + gl::set_primary_context_thread(false); } void GLGSRender::clear_surface(u32 arg) diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.cpp b/rpcs3/Emu/RSX/GL/GLHelpers.cpp index 034509f819..cc4721480c 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.cpp +++ b/rpcs3/Emu/RSX/GL/GLHelpers.cpp @@ -11,16 +11,16 @@ namespace gl capabilities g_driver_caps; const fbo screen{}; - thread_local bool tls_primary_context_thread = false; + static thread_local bool s_tls_primary_context_thread = false; - void set_primary_context_thread() + void set_primary_context_thread(bool value) { - tls_primary_context_thread = true; + s_tls_primary_context_thread = value; } bool is_primary_context_thread() { - return tls_primary_context_thread; + return s_tls_primary_context_thread; } GLenum draw_mode(rsx::primitive_type in) diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index b215df7f30..d9416b80e5 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -55,7 +55,7 @@ namespace gl bool is_primitive_native(rsx::primitive_type in); GLenum draw_mode(rsx::primitive_type in); - void set_primary_context_thread(); + void set_primary_context_thread(bool = true); bool is_primary_context_thread(); // Texture helpers