mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 13:50:03 +00:00 
			
		
		
		
	shader_jit: Add optimizations up to x86-64-v4 (#6668)
				
					
				
			This commit is contained in:
		
							parent
							
								
									6da4853360
								
							
						
					
					
						commit
						a94af8ea62
					
				
					 1 changed files with 157 additions and 63 deletions
				
			
		|  | @ -338,15 +338,39 @@ void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { | ||||||
|     // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the
 |     // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the
 | ||||||
|     // result should be transformed to 0 to match PICA fp rules.
 |     // result should be transformed to 0 to match PICA fp rules.
 | ||||||
| 
 | 
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL | Cpu::tAVX512DQ)) { | ||||||
|  |         vmulps(scratch, src1, src2); | ||||||
|  | 
 | ||||||
|  |         // Mask of any NaN values found in the result
 | ||||||
|  |         const Xbyak::Opmask zero_mask = k1; | ||||||
|  |         vcmpunordps(zero_mask, scratch, scratch); | ||||||
|  | 
 | ||||||
|  |         // Mask of any non-NaN inputs producing NaN results
 | ||||||
|  |         vcmpordps(zero_mask | zero_mask, src1, src2); | ||||||
|  | 
 | ||||||
|  |         knotb(zero_mask, zero_mask); | ||||||
|  |         vmovaps(src1 | zero_mask | T_z, scratch); | ||||||
|  | 
 | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     // Set scratch to mask of (src1 != NaN and src2 != NaN)
 |     // Set scratch to mask of (src1 != NaN and src2 != NaN)
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX)) { | ||||||
|  |         vcmpordps(scratch, src1, src2); | ||||||
|  |     } else { | ||||||
|         movaps(scratch, src1); |         movaps(scratch, src1); | ||||||
|         cmpordps(scratch, src2); |         cmpordps(scratch, src2); | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     mulps(src1, src2); |     mulps(src1, src2); | ||||||
| 
 | 
 | ||||||
|     // Set src2 to mask of (result == NaN)
 |     // Set src2 to mask of (result == NaN)
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX)) { | ||||||
|  |         vcmpunordps(src2, src2, src1); | ||||||
|  |     } else { | ||||||
|         movaps(src2, src1); |         movaps(src2, src1); | ||||||
|         cmpunordps(src2, src2); |         cmpunordps(src2, src2); | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     // Clear components where scratch != src2 (i.e. if result is NaN where neither source was NaN)
 |     // Clear components where scratch != src2 (i.e. if result is NaN where neither source was NaN)
 | ||||||
|     xorps(scratch, src2); |     xorps(scratch, src2); | ||||||
|  | @ -406,6 +430,11 @@ void JitShader::Compile_DP3(Instruction instr) { | ||||||
| 
 | 
 | ||||||
|     Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |     Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||||||
| 
 | 
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX)) { | ||||||
|  |         vshufps(SRC3, SRC1, SRC1, _MM_SHUFFLE(2, 2, 2, 2)); | ||||||
|  |         vshufps(SRC2, SRC1, SRC1, _MM_SHUFFLE(1, 1, 1, 1)); | ||||||
|  |         vshufps(SRC1, SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||||||
|  |     } else { | ||||||
|         movaps(SRC2, SRC1); |         movaps(SRC2, SRC1); | ||||||
|         shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); |         shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); | ||||||
| 
 | 
 | ||||||
|  | @ -413,6 +442,8 @@ void JitShader::Compile_DP3(Instruction instr) { | ||||||
|         shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); |         shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); | ||||||
| 
 | 
 | ||||||
|         shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); |         shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     addps(SRC1, SRC2); |     addps(SRC1, SRC2); | ||||||
|     addps(SRC1, SRC3); |     addps(SRC1, SRC3); | ||||||
| 
 | 
 | ||||||
|  | @ -589,9 +620,15 @@ void JitShader::Compile_MOV(Instruction instr) { | ||||||
| void JitShader::Compile_RCP(Instruction instr) { | void JitShader::Compile_RCP(Instruction instr) { | ||||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
| 
 | 
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { | ||||||
|  |         // Accurate to 14 bits of precision rather than 12 bits of rcpss
 | ||||||
|  |         vrcp14ss(SRC1, SRC1, SRC1); | ||||||
|  |     } else { | ||||||
|         // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
 |         // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
 | ||||||
|         // performs this operation more accurately. This should be checked on hardware.
 |         // performs this operation more accurately. This should be checked on hardware.
 | ||||||
|         rcpss(SRC1, SRC1); |         rcpss(SRC1, SRC1); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
 |     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
 | ||||||
| 
 | 
 | ||||||
|     Compile_DestEnable(instr, SRC1); |     Compile_DestEnable(instr, SRC1); | ||||||
|  | @ -600,9 +637,15 @@ void JitShader::Compile_RCP(Instruction instr) { | ||||||
| void JitShader::Compile_RSQ(Instruction instr) { | void JitShader::Compile_RSQ(Instruction instr) { | ||||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
| 
 | 
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { | ||||||
|  |         // Accurate to 14 bits of precision rather than 12 bits of rsqrtss
 | ||||||
|  |         vrsqrt14ss(SRC1, SRC1, SRC1); | ||||||
|  |     } else { | ||||||
|         // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
 |         // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
 | ||||||
|         // performs this operation more accurately. This should be checked on hardware.
 |         // performs this operation more accurately. This should be checked on hardware.
 | ||||||
|         rsqrtss(SRC1, SRC1); |         rsqrtss(SRC1, SRC1); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
 |     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
 | ||||||
| 
 | 
 | ||||||
|     Compile_DestEnable(instr, SRC1); |     Compile_DestEnable(instr, SRC1); | ||||||
|  | @ -1050,22 +1093,36 @@ Xbyak::Label JitShader::CompilePrelude_Log2() { | ||||||
|     jp(input_is_nan); |     jp(input_is_nan); | ||||||
|     jae(input_out_of_range); |     jae(input_out_of_range); | ||||||
| 
 | 
 | ||||||
|     // Split input
 |     // Split input: SRC1=MANT[1,2) SCRATCH2=Exponent
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { | ||||||
|  |         vgetexpss(SCRATCH2, SRC1, SRC1); | ||||||
|  |         vgetmantss(SRC1, SRC1, SRC1, 0x0'0); | ||||||
|  |     } else { | ||||||
|         movd(eax, SRC1); |         movd(eax, SRC1); | ||||||
|         mov(edx, eax); |         mov(edx, eax); | ||||||
|         and_(eax, 0x7f800000); |         and_(eax, 0x7f800000); | ||||||
|         and_(edx, 0x007fffff); |         and_(edx, 0x007fffff); | ||||||
|     movss(SCRATCH, xword[rip + c0]); // Preload c0.
 |  | ||||||
|         or_(edx, 0x3f800000); |         or_(edx, 0x3f800000); | ||||||
|         movd(SRC1, edx); |         movd(SRC1, edx); | ||||||
|         // SRC1 now contains the mantissa of the input.
 |         // SRC1 now contains the mantissa of the input.
 | ||||||
|     mulss(SCRATCH, SRC1); |  | ||||||
|         shr(eax, 23); |         shr(eax, 23); | ||||||
|         sub(eax, 0x7f); |         sub(eax, 0x7f); | ||||||
|         cvtsi2ss(SCRATCH2, eax); |         cvtsi2ss(SCRATCH2, eax); | ||||||
|         // SCRATCH2 now contains the exponent of the input.
 |         // SCRATCH2 now contains the exponent of the input.
 | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     movss(SCRATCH, xword[rip + c0]); | ||||||
| 
 | 
 | ||||||
|     // Complete computation of polynomial
 |     // Complete computation of polynomial
 | ||||||
|  |     if (host_caps.has(Cpu::tFMA)) { | ||||||
|  |         vfmadd213ss(SCRATCH, SRC1, xword[rip + c1]); | ||||||
|  |         vfmadd213ss(SCRATCH, SRC1, xword[rip + c2]); | ||||||
|  |         vfmadd213ss(SCRATCH, SRC1, xword[rip + c3]); | ||||||
|  |         vfmadd213ss(SCRATCH, SRC1, xword[rip + c4]); | ||||||
|  |         subss(SRC1, ONE); | ||||||
|  |         vfmadd231ss(SCRATCH2, SCRATCH, SRC1); | ||||||
|  |     } else { | ||||||
|  |         mulss(SCRATCH, SRC1); | ||||||
|         addss(SCRATCH, xword[rip + c1]); |         addss(SCRATCH, xword[rip + c1]); | ||||||
|         mulss(SCRATCH, SRC1); |         mulss(SCRATCH, SRC1); | ||||||
|         addss(SCRATCH, xword[rip + c2]); |         addss(SCRATCH, xword[rip + c2]); | ||||||
|  | @ -1076,6 +1133,7 @@ Xbyak::Label JitShader::CompilePrelude_Log2() { | ||||||
|         addss(SCRATCH, xword[rip + c4]); |         addss(SCRATCH, xword[rip + c4]); | ||||||
|         mulss(SCRATCH, SRC1); |         mulss(SCRATCH, SRC1); | ||||||
|         addss(SCRATCH2, SCRATCH); |         addss(SCRATCH2, SCRATCH); | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     // Duplicate result across vector
 |     // Duplicate result across vector
 | ||||||
|     xorps(SRC1, SRC1); // break dependency chain
 |     xorps(SRC1, SRC1); // break dependency chain
 | ||||||
|  | @ -1122,26 +1180,60 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() { | ||||||
|     // Handle edge cases
 |     // Handle edge cases
 | ||||||
|     ucomiss(SRC1, SRC1); |     ucomiss(SRC1, SRC1); | ||||||
|     jp(ret_label); |     jp(ret_label); | ||||||
|  | 
 | ||||||
|  |     // Decompose input:
 | ||||||
|  |     // SCRATCH=2^round(input)
 | ||||||
|  |     // SRC1=input-round(input) [-0.5, 0.5)
 | ||||||
|  |     if (host_caps.has(Cpu::tAVX512F | Cpu::tAVX512VL)) { | ||||||
|  |         // input - 0.5
 | ||||||
|  |         vsubss(SCRATCH, SRC1, xword[rip + half]); | ||||||
|  | 
 | ||||||
|  |         // trunc(input - 0.5)
 | ||||||
|  |         vrndscaless(SCRATCH2, SCRATCH, SCRATCH, _MM_FROUND_TRUNC); | ||||||
|  | 
 | ||||||
|  |         // SCRATCH = 1 * 2^(trunc(input - 0.5))
 | ||||||
|  |         vscalefss(SCRATCH, ONE, SCRATCH2); | ||||||
|  | 
 | ||||||
|  |         // SRC1 = input-trunc(input - 0.5)
 | ||||||
|  |         vsubss(SRC1, SRC1, SCRATCH2); | ||||||
|  |     } else { | ||||||
|         // Clamp to maximum range since we shift the value directly into the exponent.
 |         // Clamp to maximum range since we shift the value directly into the exponent.
 | ||||||
|         minss(SRC1, xword[rip + input_max]); |         minss(SRC1, xword[rip + input_max]); | ||||||
|         maxss(SRC1, xword[rip + input_min]); |         maxss(SRC1, xword[rip + input_min]); | ||||||
| 
 | 
 | ||||||
|     // Decompose input
 |         if (host_caps.has(Cpu::tAVX)) { | ||||||
|  |             vsubss(SCRATCH, SRC1, xword[rip + half]); | ||||||
|  |         } else { | ||||||
|             movss(SCRATCH, SRC1); |             movss(SCRATCH, SRC1); | ||||||
|     movss(SCRATCH2, xword[rip + c0]); // Preload c0.
 |  | ||||||
|             subss(SCRATCH, xword[rip + half]); |             subss(SCRATCH, xword[rip + half]); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         if (host_caps.has(Cpu::tSSE41)) { | ||||||
|  |             roundss(SCRATCH, SCRATCH, _MM_FROUND_TRUNC); | ||||||
|  |             cvtss2si(eax, SCRATCH); | ||||||
|  |         } else { | ||||||
|             cvtss2si(eax, SCRATCH); |             cvtss2si(eax, SCRATCH); | ||||||
|             cvtsi2ss(SCRATCH, eax); |             cvtsi2ss(SCRATCH, eax); | ||||||
|  |         } | ||||||
|         // SCRATCH now contains input rounded to the nearest integer.
 |         // SCRATCH now contains input rounded to the nearest integer.
 | ||||||
|         add(eax, 0x7f); |         add(eax, 0x7f); | ||||||
|         subss(SRC1, SCRATCH); |         subss(SRC1, SCRATCH); | ||||||
|         // SRC1 contains input - round(input), which is in [-0.5, 0.5).
 |         // SRC1 contains input - round(input), which is in [-0.5, 0.5).
 | ||||||
|     mulss(SCRATCH2, SRC1); |  | ||||||
|         shl(eax, 23); |         shl(eax, 23); | ||||||
|         movd(SCRATCH, eax); |         movd(SCRATCH, eax); | ||||||
|         // SCRATCH contains 2^(round(input)).
 |         // SCRATCH contains 2^(round(input)).
 | ||||||
|  |     } | ||||||
| 
 | 
 | ||||||
|     // Complete computation of polynomial.
 |     // Complete computation of polynomial.
 | ||||||
|  |     movss(SCRATCH2, xword[rip + c0]); | ||||||
|  | 
 | ||||||
|  |     if (host_caps.has(Cpu::tFMA)) { | ||||||
|  |         vfmadd213ss(SCRATCH2, SRC1, xword[rip + c1]); | ||||||
|  |         vfmadd213ss(SCRATCH2, SRC1, xword[rip + c2]); | ||||||
|  |         vfmadd213ss(SCRATCH2, SRC1, xword[rip + c3]); | ||||||
|  |         vfmadd213ss(SRC1, SCRATCH2, xword[rip + c4]); | ||||||
|  |     } else { | ||||||
|  |         mulss(SCRATCH2, SRC1); | ||||||
|         addss(SCRATCH2, xword[rip + c1]); |         addss(SCRATCH2, xword[rip + c1]); | ||||||
|         mulss(SCRATCH2, SRC1); |         mulss(SCRATCH2, SRC1); | ||||||
|         addss(SCRATCH2, xword[rip + c2]); |         addss(SCRATCH2, xword[rip + c2]); | ||||||
|  | @ -1149,6 +1241,8 @@ Xbyak::Label JitShader::CompilePrelude_Exp2() { | ||||||
|         addss(SCRATCH2, xword[rip + c3]); |         addss(SCRATCH2, xword[rip + c3]); | ||||||
|         mulss(SRC1, SCRATCH2); |         mulss(SRC1, SCRATCH2); | ||||||
|         addss(SRC1, xword[rip + c4]); |         addss(SRC1, xword[rip + c4]); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     mulss(SRC1, SCRATCH); |     mulss(SRC1, SCRATCH); | ||||||
| 
 | 
 | ||||||
|     // Duplicate result across vector
 |     // Duplicate result across vector
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue