spu: Reimplement FCGT and FCMGT to handle corner cases (such as comparisons against infinities and denormals)

- Also optimize FMA/FMS/FNMS for recompiler
2024-11-17 08:11:51 +00:00 · 2017-07-06 18:40:51 +03:00 · 2017-07-06 18:40:51 +03:00 · 41d921808b
commit 41d921808b
parent 9e7a42d057
2 changed files with 161 additions and 39 deletions
--- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp
@ -1674,10 +1674,53 @@ void spu_recompiler::ANDC(spu_opcode_t op)

 void spu_recompiler::FCGT(spu_opcode_t op)
 {
-	// reverted less-than
-	const XmmLink& vb = XmmGet(op.rb, XmmType::Float);
-	c->cmpps(vb, SPU_OFF_128(gpr, op.ra), 1);
-	c->movaps(SPU_OFF_128(gpr, op.rt), vb);
+	const auto last_exp_bit = XmmConst(_mm_set1_epi32(0x00800000));  //lowest bit of the exponent field
+	const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000));  //all eight exponent bits
+
+	const XmmLink& tmp0 = XmmAlloc();
+	const XmmLink& tmp1 = XmmAlloc();
+	const XmmLink& tmp2 = XmmAlloc();
+	const XmmLink& tmp3 = XmmAlloc();
+	const XmmLink& tmpv = XmmAlloc();
+
+	c->pxor(tmp0, tmp0);
+	c->pxor(tmp1, tmp1);
+	c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3);  //tmp0 is true where a is an x86 NaN (predicate 3 = unordered); inf bit patterns compare ordered and are not flagged
+	c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3);  //tmp1 is true where b is an x86 NaN (predicate 3 = unordered); inf bit patterns compare ordered and are not flagged
+
+	//compute lowered a and b: clear the lowest exponent bit (exponent 0xff becomes 0xfe), sign and mantissa are kept
+	c->movaps(tmp2, last_exp_bit);
+	c->movaps(tmp3, last_exp_bit);
+	c->pandn(tmp2, SPU_OFF_128(gpr, op.ra));  //tmp2 = lowered_a = a & ~last_exp_bit
+	c->pandn(tmp3, SPU_OFF_128(gpr, op.rb));  //tmp3 = lowered_b = b & ~last_exp_bit
+
+	//select lowered_a in lanes flagged by tmp0, keep a unchanged elsewhere
+	c->movaps(tmpv, tmp0);
+	c->pand(tmpv, tmp2);  //tmpv = lowered_a in flagged lanes, 0 elsewhere
+	c->pandn(tmp0, SPU_OFF_128(gpr, op.ra));  //tmp0 = a in unflagged lanes, 0 elsewhere
+	c->orps(tmp0, tmpv);  //tmp0 = merged a
+
+	//select lowered_b in lanes flagged by tmp1, keep b unchanged elsewhere
+	c->movaps(tmpv, tmp1);
+	c->pand(tmpv, tmp3);  //tmpv = lowered_b in flagged lanes, 0 elsewhere
+	c->pandn(tmp1, SPU_OFF_128(gpr, op.rb));  //tmp1 = b in unflagged lanes, 0 elsewhere
+	c->orps(tmp1, tmpv);  //tmp1 = merged b
+
+	//flush a to 0 if its exponent field is zero (zero or denormal)
+	c->pxor(tmpv, tmpv);  //tmpv = 0, reused for the b check below
+	c->movaps(tmp2, SPU_OFF_128(gpr, op.ra));
+	c->andps(tmp2, all_exp_bits);  //tmp2 = exponent bits of a
+	c->cmpps(tmp2, tmpv, 0);  //tmp2 is true where the exponent field is 0 (predicate 0 = equal)
+	c->pandn(tmp2, tmp0);  //tmp2 = final a: merged a, zeroed in flushed lanes
+
+	//flush b to 0 if its exponent field is zero (zero or denormal)
+	c->movaps(tmp3, SPU_OFF_128(gpr, op.rb));
+	c->andps(tmp3, all_exp_bits);  //tmp3 = exponent bits of b
+	c->cmpps(tmp3, tmpv, 0);  //tmp3 is true where the exponent field is 0 (predicate 0 = equal)
+	c->pandn(tmp3, tmp1);  //tmp3 = final b: merged b, zeroed in flushed lanes
+
+	c->cmpps(tmp3, tmp2, 1);  //tmp3 = final b < final a (predicate 1 = less-than), i.e. a > b
+	c->movaps(SPU_OFF_128(gpr, op.rt), tmp3);  //store the per-lane mask to rt
 }

 void spu_recompiler::DFCGT(spu_opcode_t op)
@ -1729,13 +1772,46 @@ void spu_recompiler::ORC(spu_opcode_t op)
 void spu_recompiler::FCMGT(spu_opcode_t op)
 {
 	// reverted less-than
-	const XmmLink& vb = XmmGet(op.rb, XmmType::Float);
-	const XmmLink& vi = XmmAlloc();
-	c->movaps(vi, XmmConst(_mm_set1_epi32(0x7fffffff)));
-	c->andps(vb, vi); // abs
-	c->andps(vi, SPU_OFF_128(gpr, op.ra));
-	c->cmpps(vb, vi, 1);
-	c->movaps(SPU_OFF_128(gpr, op.rt), vb);
+	// since comparison is absolute, a > b if a is extended (seen as NaN by x86) and b is not
+	// flush denormals to zero to make zero == zero work
+	const auto last_exp_bit = XmmConst(_mm_set1_epi32(0x00800000));  //not referenced again in this function
+	const auto all_exp_bits = XmmConst(_mm_set1_epi32(0x7f800000));  //all eight exponent bits
+	const auto remove_sign_bits = XmmConst(_mm_set1_epi32(0x7fffffff));  //and-mask that clears the sign bit (abs)
+
+	const XmmLink& tmp0 = XmmAlloc();
+	const XmmLink& tmp1 = XmmAlloc();
+	const XmmLink& tmp2 = XmmAlloc();
+	const XmmLink& tmp3 = XmmAlloc();
+	const XmmLink& tmpv = XmmAlloc();
+
+	c->pxor(tmp0, tmp0);
+	c->pxor(tmp1, tmp1);
+	c->cmpps(tmp0, SPU_OFF_128(gpr, op.ra), 3);  //tmp0 is true where a is an x86 NaN (predicate 3 = unordered); inf bit patterns compare ordered and are not flagged
+	c->cmpps(tmp1, SPU_OFF_128(gpr, op.rb), 3);  //tmp1 is true where b is an x86 NaN (predicate 3 = unordered); inf bit patterns compare ordered and are not flagged
+
+	//flush a to 0 if its exponent field is zero (zero or denormal)
+	c->pxor(tmpv, tmpv);  //tmpv = 0, reused for the b check below
+	c->movaps(tmp2, SPU_OFF_128(gpr, op.ra));
+	c->andps(tmp2, all_exp_bits);  //tmp2 = exponent bits of a
+	c->cmpps(tmp2, tmpv, 0);  //tmp2 is true where the exponent field is 0 (predicate 0 = equal)
+	c->pandn(tmp2, SPU_OFF_128(gpr, op.ra));  //tmp2 = a, zeroed in flushed lanes
+
+	//flush b to 0 if its exponent field is zero (zero or denormal)
+	c->movaps(tmp3, SPU_OFF_128(gpr, op.rb));
+	c->andps(tmp3, all_exp_bits);  //tmp3 = exponent bits of b
+	c->cmpps(tmp3, tmpv, 0);  //tmp3 is true where the exponent field is 0 (predicate 0 = equal)
+	c->pandn(tmp3, SPU_OFF_128(gpr, op.rb));  //tmp3 = b, zeroed in flushed lanes
+
+	//Set tmp1 to true where a is extended but b is not extended
+	//This is a simplification since absolute values remove necessity of lowering
+	c->xorps(tmp0, tmp1);   //tmp0 is true when exactly one of a, b is flagged
+	c->pandn(tmp1, tmp0);   //tmp1 is true if b is not flagged and a is flagged
+
+	c->andps(tmp2, remove_sign_bits);  //tmp2 = |a|
+	c->andps(tmp3, remove_sign_bits);  //tmp3 = |b|
+	c->cmpps(tmp3, tmp2, 1);  //tmp3 = |b| < |a| (predicate 1 = less-than); false in any lane where an operand is NaN
+	c->orps(tmp3, tmp1);    //Force result to all true if a is flagged but b is not
+	c->movaps(SPU_OFF_128(gpr, op.rt), tmp3);  //store the per-lane mask to rt
 }

 void spu_recompiler::DFCMGT(spu_opcode_t op)
@ -2717,15 +2793,13 @@ void spu_recompiler::FNMS(spu_opcode_t op)
 	const XmmLink& tmp_a = XmmAlloc();
 	const XmmLink& tmp_b = XmmAlloc();

-	c->movdqa(tmp_a, mask);                     //tmp_a = mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = a & mask
-	c->cmpps(tmp_a, mask, 4);                   //tmp_a = tmp_a != mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = mask_a & va
+	c->pxor(tmp_a, tmp_a);                       //tmp_a = 0
+	c->cmpps(tmp_a, SPU_OFF_128(gpr, op.ra), 3); //tmp_a = ra == extended
+	c->pandn(tmp_a, SPU_OFF_128(gpr, op.ra));    //tmp_a = mask_a & ~ra_extended

-	c->movdqa(tmp_b, mask);                      //tmp_b = mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = b & mask
-	c->cmpps(tmp_b, mask, 4);                    //tmp_b = tmp_b != mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & vb
+	c->pxor(tmp_b, tmp_b);                       //tmp_b = 0
+	c->cmpps(tmp_b, SPU_OFF_128(gpr, op.rb), 3); //tmp_b = rb == extended
+	c->pandn(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & ~rb_extended

 	c->mulps(tmp_a, tmp_b);
 	c->subps(vc, tmp_a);
@ -2736,19 +2810,16 @@ void spu_recompiler::FMA(spu_opcode_t op)
 {
 	const XmmLink& vc = XmmGet(op.rc, XmmType::Float);

-	const auto mask = XmmConst(_mm_set1_epi32(0x7f800000));
 	const XmmLink& tmp_a = XmmAlloc();
 	const XmmLink& tmp_b = XmmAlloc();

-	c->movdqa(tmp_a, mask);                     //tmp_a = mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = a & mask
-	c->cmpps(tmp_a, mask, 4);                   //tmp_a = tmp_a != mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = mask_a & va
+	c->pxor(tmp_a, tmp_a);                       //tmp_a = 0
+	c->cmpps(tmp_a, SPU_OFF_128(gpr, op.ra), 3); //tmp_a = ra == extended
+	c->pandn(tmp_a, SPU_OFF_128(gpr, op.ra));    //tmp_a = mask_a & ~ra_extended

-	c->movdqa(tmp_b, mask);                      //tmp_b = mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = b & mask
-	c->cmpps(tmp_b, mask, 4);                    //tmp_b = tmp_b != mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & vb
+	c->pxor(tmp_b, tmp_b);                       //tmp_b = 0
+	c->cmpps(tmp_b, SPU_OFF_128(gpr, op.rb), 3); //tmp_b = rb == extended
+	c->pandn(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & ~rb_extended

 	c->mulps(tmp_a, tmp_b);
 	c->addps(tmp_a, SPU_OFF_128(gpr, op.rc));
@ -2763,15 +2834,13 @@ void spu_recompiler::FMS(spu_opcode_t op)
 	const XmmLink& tmp_a = XmmAlloc();
 	const XmmLink& tmp_b = XmmAlloc();

-	c->movdqa(tmp_a, mask);                     //tmp_a = mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = a & mask
-	c->cmpps(tmp_a, mask, 4);                   //tmp_a = tmp_a != mask
-	c->andps(tmp_a, SPU_OFF_128(gpr, op.ra));   //tmp_a = mask_a & va
+	c->pxor(tmp_a, tmp_a);                       //tmp_a = 0
+	c->cmpps(tmp_a, SPU_OFF_128(gpr, op.ra), 3); //tmp_a = ra == extended
+	c->pandn(tmp_a, SPU_OFF_128(gpr, op.ra));    //tmp_a = mask_a & ~ra_extended

-	c->movdqa(tmp_b, mask);                      //tmp_b = mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = b & mask
-	c->cmpps(tmp_b, mask, 4);                    //tmp_b = tmp_b != mask
-	c->andps(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & vb
+	c->pxor(tmp_b, tmp_b);                       //tmp_b = 0
+	c->cmpps(tmp_b, SPU_OFF_128(gpr, op.rb), 3); //tmp_b = rb == extended
+	c->pandn(tmp_b, SPU_OFF_128(gpr, op.rb));    //tmp_b = mask_b & ~rb_extended

 	c->mulps(tmp_a, tmp_b);
 	c->subps(tmp_a, SPU_OFF_128(gpr, op.rc));
--- a/rpcs3/Emu/Cell/SPUInterpreter.cpp
+++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp
@ -719,7 +719,40 @@ void spu_interpreter::ANDC(SPUThread& spu, spu_opcode_t op)

 void spu_interpreter_fast::FCGT(SPUThread& spu, spu_opcode_t op)
 {
-	spu.gpr[op.rt].vf = _mm_cmplt_ps(spu.gpr[op.rb].vf, spu.gpr[op.ra].vf);
+	// IMPL NOTES:
+	// if (v is seen as NaN by x86) clear the lowest exponent bit of v, leaving sign and mantissa bits intact
+	// if (v is denormalized) v = 0 flush denormals
+	// return a > b
+	// branching simulated using bitwise ops and+and_not+or
+
+	const auto zero = _mm_set1_ps(0.f);
+	const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra].vf, zero);    //mask true where a is NaN on x86; inf bit patterns are not flagged
+	const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb].vf, zero);    //mask true where b is NaN on x86; inf bit patterns are not flagged
+
+	//calculate lowered a and b for every lane; they are only selected below where the NaN mask is set
+	const auto last_exp_bit = _mm_castsi128_ps(_mm_set1_epi32(0x00800000));
+	const auto lowered_a =_mm_andnot_ps(last_exp_bit, spu.gpr[op.ra].vf);      //a with the lowest exponent bit cleared (exponent 0xff becomes 0xfe)
+	const auto lowered_b = _mm_andnot_ps(last_exp_bit, spu.gpr[op.rb].vf);		//b with the lowest exponent bit cleared (exponent 0xff becomes 0xfe)
+
+	//check if a and b are denormalized (mask true where the exponent field is zero, which also covers +-0)
+	const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
+	const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra].vf));
+	const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb].vf));
+
+	//set a and b to their lowered values where the NaN mask is set, keep the original value elsewhere
+	const auto a_values_lowered = _mm_and_ps(nan_check_a, lowered_a);
+	const auto original_a_masked = _mm_andnot_ps(nan_check_a, spu.gpr[op.ra].vf);
+	const auto a_final1 = _mm_or_ps(a_values_lowered, original_a_masked);
+
+	const auto b_values_lowered = _mm_and_ps(nan_check_b, lowered_b);
+	const auto original_b_masked = _mm_andnot_ps(nan_check_b, spu.gpr[op.rb].vf);
+	const auto b_final1 = _mm_or_ps(b_values_lowered, original_b_masked);
+
+	//Flush denormals to zero (lanes set in the denorm mask become all-zero bits)
+	const auto final_a = _mm_andnot_ps(denorm_check_a, a_final1);
+	const auto final_b = _mm_andnot_ps(denorm_check_b, b_final1);
+
+	spu.gpr[op.rt].vf = _mm_cmplt_ps(final_b, final_a);    //b < a, i.e. a > b
 }

 void spu_interpreter::DFCGT(SPUThread& spu, spu_opcode_t op)
@ -754,8 +787,28 @@ void spu_interpreter::ORC(SPUThread& spu, spu_opcode_t op)

 void spu_interpreter_fast::FCMGT(SPUThread& spu, spu_opcode_t op)
 {
-	const auto mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
-	spu.gpr[op.rt].vf = _mm_cmplt_ps(_mm_and_ps(spu.gpr[op.rb].vf, mask), _mm_and_ps(spu.gpr[op.ra].vf, mask));
+	//IMPL NOTES: See FCGT
+
+	const auto zero = _mm_set1_ps(0.f);
+	const auto nan_check_a = _mm_cmpunord_ps(spu.gpr[op.ra].vf, zero);    //mask true where a is NaN on x86; inf bit patterns are not flagged
+	const auto nan_check_b = _mm_cmpunord_ps(spu.gpr[op.rb].vf, zero);    //mask true where b is NaN on x86; inf bit patterns are not flagged
+
+	//check if a and b are denormalized (mask true where the exponent field is zero, which also covers +-0)
+	const auto all_exp_bits = _mm_castsi128_ps(_mm_set1_epi32(0x7f800000));
+	const auto denorm_check_a = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.ra].vf));
+	const auto denorm_check_b = _mm_cmpeq_ps(zero, _mm_and_ps(all_exp_bits, spu.gpr[op.rb].vf));
+
+	//Flush denormals to zero (lanes set in the denorm mask become all-zero bits)
+	const auto final_a = _mm_andnot_ps(denorm_check_a, spu.gpr[op.ra].vf);
+	const auto final_b = _mm_andnot_ps(denorm_check_b, spu.gpr[op.rb].vf);
+
+	//Mask to make a > b if a is flagged but b is not; needed because cmplt below is false whenever an operand is NaN
+	const auto nan_mask = _mm_andnot_ps(nan_check_b, _mm_xor_ps(nan_check_a, nan_check_b));
+
+	const auto sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));    //and-mask that clears the sign bit (abs)
+	const auto comparison = _mm_cmplt_ps(_mm_and_ps(final_b, sign_mask), _mm_and_ps(final_a, sign_mask));    //|b| < |a|
+
+	spu.gpr[op.rt].vf = _mm_or_ps(comparison, nan_mask);
 }

 void spu_interpreter::DFCMGT(SPUThread& spu, spu_opcode_t op)