SPU Analyzer: Implement loop analysis

This commit is contained in:
Elad Ashkenazi 2024-05-08 14:30:12 +03:00
parent ff42459239
commit e1a0887f51

View File

@ -27,8 +27,6 @@
#include "util/simd.hpp" #include "util/simd.hpp"
#include "util/sysinfo.hpp" #include "util/sysinfo.hpp"
#pragma optimize("", off)
const extern spu_decoder<spu_itype> g_spu_itype; const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname; const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag; const extern spu_decoder<spu_iflag> g_spu_iflag;
@ -4827,6 +4825,9 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
usz parent_target_index = 0; usz parent_target_index = 0;
usz iterator_id = 0; usz iterator_id = 0;
usz temp_child_index = umax;
usz temp_list_index = umax;
// PUTLLC16 optimization analysis tracker // PUTLLC16 optimization analysis tracker
atomic16_t atomic16{}; atomic16_t atomic16{};
@ -4843,12 +4844,14 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
std::map<u32, atomic16_t> atomic16_all; // RdAtomicStat location -> atomic loop optimization state std::map<u32, atomic16_t> atomic16_all; // RdAtomicStat location -> atomic loop optimization state
std::map<u32, bool> getllar_starts; // True for failed loops std::map<u32, bool> getllar_starts; // True for failed loops
std::map<u32, bool> run_on_block; std::map<u32, bool> run_on_block;
std::map<u32, bool> logged_block;
std::array<reg_state_t, s_reg_max>* true_state_walkby = nullptr; std::array<reg_state_t, s_reg_max>* true_state_walkby = nullptr;
atomic16_t dummy16{}; atomic16_t dummy16{};
bool likely_putllc_loop = false; bool likely_putllc_loop = false;
bool had_putllc_evaluation = false;
for (u32 i = 0, count = 0; i < result.data.size(); i++) for (u32 i = 0, count = 0; i < result.data.size(); i++)
{ {
@ -4873,7 +4876,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
target_count += loc.size(); target_count += loc.size();
} }
const bool should_search_patterns = likely_putllc_loop && target_count < 100; const bool should_search_patterns = target_count < 300u;
// Treat start of function as an unknown value with tag (because it is) // Treat start of function as an unknown value with tag (because it is)
const reg_state_t start_program_count = reg_state_t::make_unknown(); const reg_state_t start_program_count = reg_state_t::make_unknown();
@ -4890,8 +4893,6 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
SPU_LS_MASK_1 = (SPU_LS_SIZE - 1), SPU_LS_MASK_1 = (SPU_LS_SIZE - 1),
}; };
const bool log_it = result.data.size() == 0x5b && entry_point == 0x9a28;
u32 iterator_id_alloc = 0; u32 iterator_id_alloc = 0;
for (u32 wf = 0, wi = 0, wa = entry_point, bpc = wa; wf <= 1;) for (u32 wf = 0, wi = 0, wa = entry_point, bpc = wa; wf <= 1;)
@ -4914,7 +4915,6 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
if (!should_search_patterns) if (!should_search_patterns)
{ {
// TODO: Enable constant search for all
break; break;
} }
@ -4944,9 +4944,10 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
} }
} }
spu_log.always()("%s", out); spu_log.fatal("%s", out);
} }
true_state_walkby = &infos[bpc]->evaluate_start_state(infos);
true_state_walkby = &ensure(infos[bpc])->evaluate_start_state(infos);
for (reg_state_t& f : *true_state_walkby) for (reg_state_t& f : *true_state_walkby)
{ {
@ -4977,6 +4978,8 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
return; return;
} }
had_putllc_evaluation = true;
g_fxo->get<putllc16_statistics_t>().breaking_reason[cause]++; g_fxo->get<putllc16_statistics_t>().breaking_reason[cause]++;
if (!spu_log.notice) if (!spu_log.notice)
@ -5085,15 +5088,14 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
usz stackframe_pc = SPU_LS_SIZE; usz stackframe_pc = SPU_LS_SIZE;
usz entry_index = umax; usz entry_index = umax;
auto get_block_targets = [&](u32 pc) -> const std::basic_string<u32>& auto get_block_targets = [&](u32 pc) -> std::basic_string_view<u32>
{ {
if (m_block_info[pc / 4] && m_bbs.count(pc)) if (m_block_info[pc / 4] && m_bbs.count(pc))
{ {
return m_bbs.at(pc).targets; return m_bbs.at(pc).targets;
} }
static const std::basic_string<u32> s_empty; return {};
return s_empty;
}; };
u32 target_pc = SPU_LS_SIZE; u32 target_pc = SPU_LS_SIZE;
@ -5107,7 +5109,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
stackframe_pc = state_it->pc; stackframe_pc = state_it->pc;
entry_index = state_it->parent_target_index; entry_index = state_it->parent_target_index;
const auto& targets = get_block_targets(stackframe_pc); const auto targets = get_block_targets(stackframe_pc);
const usz target_size = targets.size(); const usz target_size = targets.size();
@ -5121,15 +5123,16 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{ {
const usz parent_index = state_it->parent_iterator_index; const usz parent_index = state_it->parent_iterator_index;
to_pop.emplace_back(stackframe_it);
if (parent_index != umax) if (parent_index != umax)
{ {
to_pop.emplace_back(stackframe_it);
stackframe_it = parent_index; stackframe_it = parent_index;
} }
else else
{ {
spu_log.success("Clearing"); // Final
reg_state_it.clear(); wi = 0;
break; break;
} }
} }
@ -5137,16 +5140,14 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{ {
target_pc = ::at32(targets, entry_index); target_pc = ::at32(targets, entry_index);
// Check block duplication (terminating infinite loops) usz occurence_count = 0;
// Even if duplicated, this still has impact by registering the end of the possible code path outcome std::array<usz, 16> duplicate_positions;
std::set<u32> positions;
u32 dup_pc = SPU_LS_SIZE;
u32 occurence_count = 0;
// Virtual concept (there is really no such thing as a loop connector at the compiled-code level) // Virtual concept (there is really no such thing as a loop connector at the compiled-code level)
// But it helps to simplify this process // But it helps to simplify this process
bool is_loop_connector_or_too_extensive = false; bool is_loop_connector = false;
bool is_too_extensive = false;
bool is_skipable = false;
// Hack to avoid extensive analysis of all code paths possible: // Hack to avoid extensive analysis of all code paths possible:
// Allow up to 4 occurrences of the upper-most block // Allow up to 4 occurrences of the upper-most block
@ -5154,43 +5155,100 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
// The proper solution would be to add a precursory function analysis stage which identifies all loop "connectors" and allows duplicates based on it // The proper solution would be to add a precursory function analysis stage which identifies all loop "connectors" and allows duplicates based on it
for (usz i = stackframe_it, count = 0;; count++) for (usz i = stackframe_it, count = 0;; count++)
{ {
if (count >= 100) auto& entry = ::at32(reg_state_it, i);
const u32 entry_pc = entry.pc;
if (count == (state_it->atomic16.active ? 40 : 12))
{ {
is_loop_connector_or_too_extensive = true; if (state_it->atomic16.active && !std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU Blcok Analysis is too extensive at 0x%x", entry_pc);
}
is_too_extensive = true;
break; break;
} }
auto& entry = ::at32(reg_state_it, i);
const u32 entry_pc = entry.pc;
if (entry_pc == target_pc) if (entry_pc == target_pc)
{ {
if (dup_pc == entry_pc) duplicate_positions[occurence_count++] = i;
{
occurence_count++;
if (occurence_count == 4) if (occurence_count == duplicate_positions.size())
{
is_loop_connector_or_too_extensive = true;
break;
}
}
else if (dup_pc > entry_pc)
{ {
dup_pc = entry_pc; is_loop_connector = true;
break;
} }
} }
const usz parent = entry.parent_iterator_index; const usz parent_idx = entry.parent_iterator_index;
if (parent == umax) if (parent_idx == umax)
{ {
break; break;
} }
ensure(i != parent); ensure(i != parent_idx);
i = parent;
// Fill info for later
auto& parent = ::at32(reg_state_it, parent_idx);
parent.temp_child_index = i;
parent.temp_list_index = count;
i = parent_idx;
}
// Scan the code for "code flow" repetitions (entire sequences of blocks equal to each other)
// If found, this is certainly a loop; should it start a third time, ignore it
if (occurence_count >= 2)
{
for (usz it_begin = 0; !is_skipable && it_begin < occurence_count - 1; it_begin++)
{
const usz block_start = duplicate_positions[it_begin + 1];
for (usz it_tail = 0; it_tail < it_begin + 1; it_tail++)
{
const usz block_tail = duplicate_positions[it_begin - it_tail];
// Check if the distance is precisely two times from the end
if (reg_state_it.size() - block_start != utils::rol64(reg_state_it.size() - block_tail, 1))
{
continue;
}
bool is_equal = true;
for (usz j = 1; j < reg_state_it.size() - block_tail; j++)
{
if (reg_state_it[block_start + j].pc != reg_state_it[block_tail + j].pc)
{
is_equal = false;
break;
}
}
if (is_equal)
{
is_skipable = true;
break;
}
}
}
}
if (is_skipable)
{
if (!std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU block is a loop at [0x%05x -> 0x%05x]", state_it->pc, target_pc);
}
state_it->parent_target_index++;
continue;
}
if (is_loop_connector && !std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU block analysis is too repetitive at [0x%05x -> 0x%05x]", state_it->pc, target_pc);
} }
insert_entry = true; insert_entry = true;
@ -5198,17 +5256,19 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
// Test if the code is an opening to external code (start of the function is always respected because it is already assumed to have no origin) // Test if the code is an opening to external code (start of the function is always respected because it is already assumed to have no origin)
is_code_backdoor = m_ret_info[target_pc / 4] || (m_entry_info[target_pc / 4] && target_pc != entry_point); is_code_backdoor = m_ret_info[target_pc / 4] || (m_entry_info[target_pc / 4] && target_pc != entry_point);
if (is_code_backdoor || is_loop_connector_or_too_extensive) if (run_on_block[target_pc / 4])
{ {
insert_entry = false;
}
else if (is_code_backdoor || is_too_extensive || is_loop_connector)
{
if (reg_state_it[stackframe_it].atomic16.active)
{
break_putllc16(40, reg_state_it[stackframe_it].atomic16.discard());
}
// Allow the block to run only once, to avoid unnecessary iterations // Allow the block to run only once, to avoid unnecessary iterations
if (run_on_block[target_pc / 4]) run_on_block[target_pc / 4] = true;
{
insert_entry = false;
}
else
{
run_on_block[target_pc / 4] = true;
}
} }
state_it->parent_target_index++; state_it->parent_target_index++;
@ -5238,6 +5298,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{ {
// Should not be reachable at the moment // Should not be reachable at the moment
//ensure(false); //ensure(false);
spu_log.error("Failed to clean block analyis steps at block_id %d", reg_state_it[it].iterator_id);
} }
} }
@ -5245,7 +5306,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
{ {
const u32 target_size = get_block_targets(stackframe_pc).size(); const u32 target_size = get_block_targets(stackframe_pc).size();
spu_log.always()("Emplacing: wi=%d, pc=0x%x, target_it=%d/%d, new_pc=0x%x (has_it=%d)", stackframe_it, stackframe_pc, entry_index + 1, target_size, target_pc, atomic16_info.active); spu_log.trace("Emplacing: block_id=%d, pc=0x%x, target_it=%d/%d, new_pc=0x%x (has_it=%d)", reg_state_it[stackframe_it].iterator_id, stackframe_pc, entry_index + 1, target_size, target_pc, atomic16_info.active);
auto& next = reg_state_it.emplace_back(target_pc, stackframe_it, 0); auto& next = reg_state_it.emplace_back(target_pc, stackframe_it, 0);
if (!is_code_backdoor) if (!is_code_backdoor)
@ -5259,7 +5320,8 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
} }
next.iterator_id = iterator_id_alloc++; next.iterator_id = iterator_id_alloc++;
wi++; wi = stackframe_it + 1;
ensure(stackframe_it + 1 == reg_state_it.size() - 1);
} }
} }
@ -5407,7 +5469,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
const auto type = g_spu_itype.decode(data); const auto type = g_spu_itype.decode(data);
// For debugging // For debugging
if (likely_putllc_loop && (log_it || is_pattern_match)) if (false && likely_putllc_loop && is_pattern_match)
{ {
SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast<const u8*>(result.data.data()), result.lower_bound); SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast<const u8*>(result.data.data()), result.lower_bound);
dis_asm.disasm(pos); dis_asm.disasm(pos);
@ -6624,6 +6686,7 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
} }
auto& stats = g_fxo->get<putllc16_statistics_t>(); auto& stats = g_fxo->get<putllc16_statistics_t>();
had_putllc_evaluation = true;
if (!pattern.ls_write) if (!pattern.ls_write)
{ {
@ -6707,6 +6770,11 @@ spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, s
, pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, +stats.nowrite, ++stats.single, +stats.all); , pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, +stats.nowrite, ++stats.single, +stats.all);
} }
if (likely_putllc_loop && !had_putllc_evaluation)
{
spu_log.notice("Likely missed PUTLLC16 patterns. (entry=0x%x)", entry_point);
}
if (result.data.empty()) if (result.data.empty())
{ {
// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback