mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-30 21:30:04 +00:00 
			
		
		
		
	shader: Fix address register offset behavior in x64 Jit (#6942)
* shader: Fix address register offset behavior in x64 JIT
* shader: Remove redundant jump
* tests: Add address register tests
* shader: Remove additional pre-multiplications by 16
* tests: Add Catch stringifier for Vec4f
* tests: Format
This commit is contained in:
		
							parent
							
								
									1caf569f16
								
							
						
					
					
						commit
						ef43776c7b
					
				
					 2 changed files with 94 additions and 27 deletions
				
			
		|  | @ -28,6 +28,15 @@ static constexpr Common::Vec4f vec4_nan = Common::Vec4f::AssignToAll(NAN); | ||||||
| static constexpr Common::Vec4f vec4_one = Common::Vec4f::AssignToAll(1.0f); | static constexpr Common::Vec4f vec4_one = Common::Vec4f::AssignToAll(1.0f); | ||||||
| static constexpr Common::Vec4f vec4_zero = Common::Vec4f::AssignToAll(0.0f); | static constexpr Common::Vec4f vec4_zero = Common::Vec4f::AssignToAll(0.0f); | ||||||
| 
 | 
 | ||||||
|  | namespace Catch { | ||||||
|  | template <> | ||||||
|  | struct StringMaker<Common::Vec4f> { | ||||||
|  |     static std::string convert(Common::Vec4f value) { | ||||||
|  |         return fmt::format("({}, {}, {}, {})", value.r(), value.g(), value.b(), value.a()); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  | } // namespace Catch
 | ||||||
|  | 
 | ||||||
| static std::unique_ptr<Pica::Shader::ShaderSetup> CompileShaderSetup( | static std::unique_ptr<Pica::Shader::ShaderSetup> CompileShaderSetup( | ||||||
|     std::initializer_list<nihstro::InlineAsm> code) { |     std::initializer_list<nihstro::InlineAsm> code) { | ||||||
|     const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code); |     const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code); | ||||||
|  | @ -385,6 +394,56 @@ TEST_CASE("RSQ", "[video_core][shader][shader_jit]") { | ||||||
|     REQUIRE(shader.Run({0.0625f}).x == Catch::Approx(4.0f).margin(0.004f)); |     REQUIRE(shader.Run({0.0625f}).x == Catch::Approx(4.0f).margin(0.004f)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | TEST_CASE("Address Register Offset", "[video_core][shader][shader_jit]") { | ||||||
|  |     const auto sh_input = SourceRegister::MakeInput(0); | ||||||
|  |     const auto sh_c40 = SourceRegister::MakeFloat(40); | ||||||
|  |     const auto sh_output = DestRegister::MakeOutput(0); | ||||||
|  | 
 | ||||||
|  |     auto shader = ShaderTest({ | ||||||
|  |         // mova a0.x, sh_input.x
 | ||||||
|  |         {OpCode::Id::MOVA, DestRegister{}, "x", sh_input, "x", SourceRegister{}, "", | ||||||
|  |          nihstro::InlineAsm::RelativeAddress::A1}, | ||||||
|  |         // mov sh_output.xyzw, c40[a0.x].xyzw
 | ||||||
|  |         {OpCode::Id::MOV, sh_output, "xyzw", sh_c40, "xyzw", SourceRegister{}, "", | ||||||
|  |          nihstro::InlineAsm::RelativeAddress::A1}, | ||||||
|  |         {OpCode::Id::END}, | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|  |     // Prepare shader uniforms
 | ||||||
|  |     const bool inverted = true; | ||||||
|  |     std::array<Common::Vec4f, 96> f_uniforms; | ||||||
|  |     for (u32 i = 0; i < 0x80; i++) { | ||||||
|  |         if (i >= 0x00 && i < 0x60) { | ||||||
|  |             const u32 base = inverted ? (0x60 - i) : i; | ||||||
|  |             const auto color = (base * 2.f) / 255.0f; | ||||||
|  |             const auto color_f24 = Pica::f24::FromFloat32(color); | ||||||
|  |             shader.shader_setup->uniforms.f[i] = {color_f24, color_f24, color_f24, | ||||||
|  |                                                   Pica::f24::One()}; | ||||||
|  |             f_uniforms[i] = {color, color, color, 1.f}; | ||||||
|  |         } else if (i >= 0x60 && i < 0x70) { | ||||||
|  |             const u8 color = static_cast<u8>((i - 0x60) * 0x10); | ||||||
|  |             shader.shader_setup->uniforms.i[i - 0x60] = {color, color, color, 255}; | ||||||
|  |         } else if (i >= 0x70 && i < 0x80) { | ||||||
|  |             shader.shader_setup->uniforms.b[i - 0x70] = i >= 0x78; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     REQUIRE(shader.Run(0.f) == f_uniforms[40]); | ||||||
|  |     REQUIRE(shader.Run(13.f) == f_uniforms[53]); | ||||||
|  |     REQUIRE(shader.Run(50.f) == f_uniforms[90]); | ||||||
|  |     REQUIRE(shader.Run(60.f) == vec4_one); | ||||||
|  |     REQUIRE(shader.Run(74.f) == vec4_one); | ||||||
|  |     REQUIRE(shader.Run(87.f) == vec4_one); | ||||||
|  |     REQUIRE(shader.Run(88.f) == f_uniforms[0]); | ||||||
|  |     REQUIRE(shader.Run(128.f) == f_uniforms[40]); | ||||||
|  |     REQUIRE(shader.Run(-40.f) == f_uniforms[0]); | ||||||
|  |     REQUIRE(shader.Run(-42.f) == vec4_one); | ||||||
|  |     REQUIRE(shader.Run(-70.f) == vec4_one); | ||||||
|  |     REQUIRE(shader.Run(-73.f) == f_uniforms[95]); | ||||||
|  |     REQUIRE(shader.Run(-127.f) == f_uniforms[41]); | ||||||
|  |     REQUIRE(shader.Run(-129.f) == f_uniforms[40]); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // TODO: Requires fix from https://github.com/neobrain/nihstro/issues/68
 | // TODO: Requires fix from https://github.com/neobrain/nihstro/issues/68
 | ||||||
| // TEST_CASE("MAD", "[video_core][shader][shader_jit]") {
 | // TEST_CASE("MAD", "[video_core][shader][shader_jit]") {
 | ||||||
| //     const auto sh_input1 = SourceRegister::MakeInput(0);
 | //     const auto sh_input1 = SourceRegister::MakeInput(0);
 | ||||||
|  |  | ||||||
|  | @ -232,21 +232,45 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe | ||||||
|         address_register_index = instr.common.address_register_index; |         address_register_index = instr.common.address_register_index; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (src_num == offset_src && address_register_index != 0) { |     if (src_reg.GetRegisterType() == RegisterType::FloatUniform && src_num == offset_src && | ||||||
|  |         address_register_index != 0) { | ||||||
|  |         Xbyak::Reg64 address_reg; | ||||||
|         switch (address_register_index) { |         switch (address_register_index) { | ||||||
|         case 1: // address offset 1
 |         case 1: | ||||||
|             movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); |             address_reg = ADDROFFS_REG_0; | ||||||
|             break; |             break; | ||||||
|         case 2: // address offset 2
 |         case 2: | ||||||
|             movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); |             address_reg = ADDROFFS_REG_1; | ||||||
|             break; |             break; | ||||||
|         case 3: // address offset 3
 |         case 3: | ||||||
|             movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); |             address_reg = LOOPCOUNT_REG.cvt64(); | ||||||
|             break; |             break; | ||||||
|         default: |         default: | ||||||
|             UNREACHABLE(); |             UNREACHABLE(); | ||||||
|             break; |             break; | ||||||
|         } |         } | ||||||
|  |         // s32 offset = address_reg >= -128 && address_reg <= 127 ? address_reg : 0;
 | ||||||
|  |         // u32 index = (src_reg.GetIndex() + offset) & 0x7f;
 | ||||||
|  | 
 | ||||||
|  |         // First we add 128 to address_reg so that the range check
 | ||||||
|  |         // -128 <= address_reg <= 127 becomes 0 <= (address_reg + 128) < 256, which can be
 | ||||||
|  |         // performed with a single unsigned comparison (cmp followed by cmovb)
 | ||||||
|  |         lea(eax, ptr[address_reg + 128]); | ||||||
|  |         mov(ebx, src_reg.GetIndex()); | ||||||
|  |         mov(ecx, address_reg.cvt32()); | ||||||
|  |         add(ecx, ebx); | ||||||
|  |         cmp(eax, 256); | ||||||
|  |         cmovb(ebx, ecx); | ||||||
|  |         and_(ebx, 0x7f); | ||||||
|  | 
 | ||||||
|  |         // index > 95 ? vec4(1.0) : uniforms.f[index];
 | ||||||
|  |         movaps(dest, ONE); | ||||||
|  |         cmp(ebx, 95); | ||||||
|  |         Label load_end; | ||||||
|  |         jg(load_end); | ||||||
|  |         shl(rbx, 4); | ||||||
|  |         movaps(dest, xword[src_ptr + rbx]); | ||||||
|  |         L(load_end); | ||||||
|     } else { |     } else { | ||||||
|         // Load the source
 |         // Load the source
 | ||||||
|         movaps(dest, xword[src_ptr + src_offset_disp]); |         movaps(dest, xword[src_ptr + src_offset_disp]); | ||||||
|  | @ -590,24 +614,14 @@ void JitShader::Compile_MOVA(Instruction instr) { | ||||||
|         // Move and sign-extend high 32 bits
 |         // Move and sign-extend high 32 bits
 | ||||||
|         shr(rax, 32); |         shr(rax, 32); | ||||||
|         movsxd(ADDROFFS_REG_1, eax); |         movsxd(ADDROFFS_REG_1, eax); | ||||||
| 
 |  | ||||||
|         // Multiply by 16 to be used as an offset later
 |  | ||||||
|         shl(ADDROFFS_REG_0, 4); |  | ||||||
|         shl(ADDROFFS_REG_1, 4); |  | ||||||
|     } else { |     } else { | ||||||
|         if (swiz.DestComponentEnabled(0)) { |         if (swiz.DestComponentEnabled(0)) { | ||||||
|             // Move and sign-extend low 32 bits
 |             // Move and sign-extend low 32 bits
 | ||||||
|             movsxd(ADDROFFS_REG_0, eax); |             movsxd(ADDROFFS_REG_0, eax); | ||||||
| 
 |  | ||||||
|             // Multiply by 16 to be used as an offset later
 |  | ||||||
|             shl(ADDROFFS_REG_0, 4); |  | ||||||
|         } else if (swiz.DestComponentEnabled(1)) { |         } else if (swiz.DestComponentEnabled(1)) { | ||||||
|             // Move and sign-extend high 32 bits
 |             // Move and sign-extend high 32 bits
 | ||||||
|             shr(rax, 32); |             shr(rax, 32); | ||||||
|             movsxd(ADDROFFS_REG_1, eax); |             movsxd(ADDROFFS_REG_1, eax); | ||||||
| 
 |  | ||||||
|             // Multiply by 16 to be used as an offset later
 |  | ||||||
|             shl(ADDROFFS_REG_1, 4); |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | @ -659,9 +673,6 @@ void JitShader::Compile_END(Instruction instr) { | ||||||
|     mov(byte[STATE + offsetof(UnitState, conditional_code[1])], COND1.cvt8()); |     mov(byte[STATE + offsetof(UnitState, conditional_code[1])], COND1.cvt8()); | ||||||
| 
 | 
 | ||||||
|     // Save address/loop registers
 |     // Save address/loop registers
 | ||||||
|     sar(ADDROFFS_REG_0, 4); |  | ||||||
|     sar(ADDROFFS_REG_1, 4); |  | ||||||
|     sar(LOOPCOUNT_REG, 4); |  | ||||||
|     mov(dword[STATE + offsetof(UnitState, address_registers[0])], ADDROFFS_REG_0.cvt32()); |     mov(dword[STATE + offsetof(UnitState, address_registers[0])], ADDROFFS_REG_0.cvt32()); | ||||||
|     mov(dword[STATE + offsetof(UnitState, address_registers[1])], ADDROFFS_REG_1.cvt32()); |     mov(dword[STATE + offsetof(UnitState, address_registers[1])], ADDROFFS_REG_1.cvt32()); | ||||||
|     mov(dword[STATE + offsetof(UnitState, address_registers[2])], LOOPCOUNT_REG); |     mov(dword[STATE + offsetof(UnitState, address_registers[2])], LOOPCOUNT_REG); | ||||||
|  | @ -813,11 +824,11 @@ void JitShader::Compile_LOOP(Instruction instr) { | ||||||
|     std::size_t offset = Uniforms::GetIntUniformOffset(instr.flow_control.int_uniform_id); |     std::size_t offset = Uniforms::GetIntUniformOffset(instr.flow_control.int_uniform_id); | ||||||
|     mov(LOOPCOUNT, dword[UNIFORMS + offset]); |     mov(LOOPCOUNT, dword[UNIFORMS + offset]); | ||||||
|     mov(LOOPCOUNT_REG, LOOPCOUNT); |     mov(LOOPCOUNT_REG, LOOPCOUNT); | ||||||
|     shr(LOOPCOUNT_REG, 4); |     shr(LOOPCOUNT_REG, 8); | ||||||
|     and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
 |     and_(LOOPCOUNT_REG, 0xFF); // Y-component is the start
 | ||||||
|     mov(LOOPINC, LOOPCOUNT); |     mov(LOOPINC, LOOPCOUNT); | ||||||
|     shr(LOOPINC, 12); |     shr(LOOPINC, 16); | ||||||
|     and_(LOOPINC, 0xFF0);               // Z-component is the incrementer
 |     and_(LOOPINC, 0xFF);                // Z-component is the incrementer
 | ||||||
|     movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
 |     movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
 | ||||||
|     add(LOOPCOUNT, 1);                  // Iteration count is X-component + 1
 |     add(LOOPCOUNT, 1);                  // Iteration count is X-component + 1
 | ||||||
| 
 | 
 | ||||||
|  | @ -993,9 +1004,6 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_ | ||||||
|     movsxd(ADDROFFS_REG_0, dword[STATE + offsetof(UnitState, address_registers[0])]); |     movsxd(ADDROFFS_REG_0, dword[STATE + offsetof(UnitState, address_registers[0])]); | ||||||
|     movsxd(ADDROFFS_REG_1, dword[STATE + offsetof(UnitState, address_registers[1])]); |     movsxd(ADDROFFS_REG_1, dword[STATE + offsetof(UnitState, address_registers[1])]); | ||||||
|     mov(LOOPCOUNT_REG, dword[STATE + offsetof(UnitState, address_registers[2])]); |     mov(LOOPCOUNT_REG, dword[STATE + offsetof(UnitState, address_registers[2])]); | ||||||
|     shl(ADDROFFS_REG_0, 4); |  | ||||||
|     shl(ADDROFFS_REG_1, 4); |  | ||||||
|     shl(LOOPCOUNT_REG, 4); |  | ||||||
| 
 | 
 | ||||||
|     // Load conditional code
 |     // Load conditional code
 | ||||||
|     mov(COND0, byte[STATE + offsetof(UnitState, conditional_code[0])]); |     mov(COND0, byte[STATE + offsetof(UnitState, conditional_code[0])]); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue