mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-30 21:30:04 +00:00 
			
		
		
		
	Merge pull request #3145 from MerryMage/lg2-ex2
shader_jit_x64_compiler: Remove ABI overhead of LG2 and EX2
This commit is contained in:
		
						commit
						e23c3cd7f7
					
				
					 5 changed files with 285 additions and 20 deletions
				
			
		|  | @ -14,11 +14,17 @@ set(HEADERS | ||||||
|             core/arm/arm_test_common.h |             core/arm/arm_test_common.h | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|  | if (ARCHITECTURE_x86_64) | ||||||
|  |     set(SRCS ${SRCS} | ||||||
|  |             video_core/shader/shader_jit_x64_compiler.cpp | ||||||
|  |             ) | ||||||
|  | endif() | ||||||
|  | 
 | ||||||
| create_directory_groups(${SRCS} ${HEADERS}) | create_directory_groups(${SRCS} ${HEADERS}) | ||||||
| 
 | 
 | ||||||
| add_executable(tests ${SRCS} ${HEADERS}) | add_executable(tests ${SRCS} ${HEADERS}) | ||||||
| target_link_libraries(tests PRIVATE common core) | target_link_libraries(tests PRIVATE common core video_core) | ||||||
| target_link_libraries(tests PRIVATE glad) # To support linker work-around | target_link_libraries(tests PRIVATE glad) # To support linker work-around | ||||||
| target_link_libraries(tests PRIVATE ${PLATFORM_LIBRARIES} catch-single-include Threads::Threads) | target_link_libraries(tests PRIVATE ${PLATFORM_LIBRARIES} catch-single-include nihstro-headers Threads::Threads) | ||||||
| 
 | 
 | ||||||
| add_test(NAME tests COMMAND tests) | add_test(NAME tests COMMAND tests) | ||||||
|  |  | ||||||
							
								
								
									
										91
									
								
								src/tests/video_core/shader/shader_jit_x64_compiler.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								src/tests/video_core/shader/shader_jit_x64_compiler.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,91 @@ | ||||||
|  | // Copyright 2017 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #include <algorithm> | ||||||
|  | #include <cmath> | ||||||
|  | #include <memory> | ||||||
|  | #include <catch.hpp> | ||||||
|  | #include <nihstro/inline_assembly.h> | ||||||
|  | #include "video_core/shader/shader_jit_x64_compiler.h" | ||||||
|  | 
 | ||||||
|  | using float24 = Pica::float24; | ||||||
|  | using JitShader = Pica::Shader::JitShader; | ||||||
|  | 
 | ||||||
|  | using DestRegister = nihstro::DestRegister; | ||||||
|  | using OpCode = nihstro::OpCode; | ||||||
|  | using SourceRegister = nihstro::SourceRegister; | ||||||
|  | 
 | ||||||
|  | static std::unique_ptr<JitShader> CompileShader(std::initializer_list<nihstro::InlineAsm> code) { | ||||||
|  |     const auto shbin = nihstro::InlineAsm::CompileToRawBinary(code); | ||||||
|  | 
 | ||||||
|  |     std::array<u32, Pica::Shader::MAX_PROGRAM_CODE_LENGTH> program_code{}; | ||||||
|  |     std::array<u32, Pica::Shader::MAX_SWIZZLE_DATA_LENGTH> swizzle_data{}; | ||||||
|  | 
 | ||||||
|  |     std::transform(shbin.program.begin(), shbin.program.end(), program_code.begin(), | ||||||
|  |                    [](const auto& x) { return x.hex; }); | ||||||
|  |     std::transform(shbin.swizzle_table.begin(), shbin.swizzle_table.end(), swizzle_data.begin(), | ||||||
|  |                    [](const auto& x) { return x.hex; }); | ||||||
|  | 
 | ||||||
|  |     auto shader = std::make_unique<JitShader>(); | ||||||
|  |     shader->Compile(&program_code, &swizzle_data); | ||||||
|  | 
 | ||||||
|  |     return shader; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | class ShaderTest { | ||||||
|  | public: | ||||||
|  |     explicit ShaderTest(std::initializer_list<nihstro::InlineAsm> code) | ||||||
|  |         : shader(CompileShader(code)) {} | ||||||
|  | 
 | ||||||
|  |     float Run(float input) { | ||||||
|  |         Pica::Shader::ShaderSetup shader_setup; | ||||||
|  |         Pica::Shader::UnitState shader_unit; | ||||||
|  | 
 | ||||||
|  |         shader_unit.registers.input[0].x = float24::FromFloat32(input); | ||||||
|  |         shader->Run(shader_setup, shader_unit, 0); | ||||||
|  |         return shader_unit.registers.output[0].x.ToFloat32(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | public: | ||||||
|  |     std::unique_ptr<JitShader> shader; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | TEST_CASE("LG2", "[video_core][shader][shader_jit]") { | ||||||
|  |     const auto sh_input = SourceRegister::MakeInput(0); | ||||||
|  |     const auto sh_output = DestRegister::MakeOutput(0); | ||||||
|  | 
 | ||||||
|  |     auto shader = ShaderTest({ | ||||||
|  |         // clang-format off
 | ||||||
|  |         {OpCode::Id::LG2, sh_output, sh_input}, | ||||||
|  |         {OpCode::Id::END}, | ||||||
|  |         // clang-format on
 | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|  |     REQUIRE(std::isnan(shader.Run(NAN))); | ||||||
|  |     REQUIRE(std::isnan(shader.Run(-1.f))); | ||||||
|  |     REQUIRE(std::isinf(shader.Run(0.f))); | ||||||
|  |     REQUIRE(shader.Run(4.f) == Approx(2.f)); | ||||||
|  |     REQUIRE(shader.Run(64.f) == Approx(6.f)); | ||||||
|  |     REQUIRE(shader.Run(1.e24f) == Approx(79.7262742773f)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | TEST_CASE("EX2", "[video_core][shader][shader_jit]") { | ||||||
|  |     const auto sh_input = SourceRegister::MakeInput(0); | ||||||
|  |     const auto sh_output = DestRegister::MakeOutput(0); | ||||||
|  | 
 | ||||||
|  |     auto shader = ShaderTest({ | ||||||
|  |         // clang-format off
 | ||||||
|  |         {OpCode::Id::EX2, sh_output, sh_input}, | ||||||
|  |         {OpCode::Id::END}, | ||||||
|  |         // clang-format on
 | ||||||
|  |     }); | ||||||
|  | 
 | ||||||
|  |     REQUIRE(std::isnan(shader.Run(NAN))); | ||||||
|  |     REQUIRE(shader.Run(-800.f) == Approx(0.f)); | ||||||
|  |     REQUIRE(shader.Run(0.f) == Approx(1.f)); | ||||||
|  |     REQUIRE(shader.Run(2.f) == Approx(4.f)); | ||||||
|  |     REQUIRE(shader.Run(6.f) == Approx(64.f)); | ||||||
|  |     REQUIRE(shader.Run(79.7262742773f) == Approx(1.e24f)); | ||||||
|  |     REQUIRE(std::isinf(shader.Run(800.f))); | ||||||
|  | } | ||||||
|  | @ -87,7 +87,7 @@ target_link_libraries(video_core PUBLIC common core) | ||||||
| target_link_libraries(video_core PRIVATE glad nihstro-headers) | target_link_libraries(video_core PRIVATE glad nihstro-headers) | ||||||
| 
 | 
 | ||||||
| if (ARCHITECTURE_x86_64) | if (ARCHITECTURE_x86_64) | ||||||
|     target_link_libraries(video_core PRIVATE xbyak) |     target_link_libraries(video_core PUBLIC xbyak) | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
| if (PNG_FOUND) | if (PNG_FOUND) | ||||||
|  |  | ||||||
|  | @ -432,27 +432,13 @@ void JitShader::Compile_DPH(Instruction instr) { | ||||||
| 
 | 
 | ||||||
| void JitShader::Compile_EX2(Instruction instr) { | void JitShader::Compile_EX2(Instruction instr) { | ||||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|     movss(xmm0, SRC1); // ABI_PARAM1
 |     call(exp2_subroutine); | ||||||
| 
 |  | ||||||
|     ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |  | ||||||
|     CallFarFunction(*this, exp2f); |  | ||||||
|     ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |  | ||||||
| 
 |  | ||||||
|     shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
 |  | ||||||
|     movaps(SRC1, xmm0); |  | ||||||
|     Compile_DestEnable(instr, SRC1); |     Compile_DestEnable(instr, SRC1); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void JitShader::Compile_LG2(Instruction instr) { | void JitShader::Compile_LG2(Instruction instr) { | ||||||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|     movss(xmm0, SRC1); // ABI_PARAM1
 |     call(log2_subroutine); | ||||||
| 
 |  | ||||||
|     ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |  | ||||||
|     CallFarFunction(*this, log2f); |  | ||||||
|     ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |  | ||||||
| 
 |  | ||||||
|     shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
 |  | ||||||
|     movaps(SRC1, xmm0); |  | ||||||
|     Compile_DestEnable(instr, SRC1); |     Compile_DestEnable(instr, SRC1); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -935,7 +921,179 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_ | ||||||
|     LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); |     LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} | JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) { | ||||||
|  |     CompilePrelude(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitShader::CompilePrelude() { | ||||||
|  |     log2_subroutine = CompilePrelude_Log2(); | ||||||
|  |     exp2_subroutine = CompilePrelude_Exp2(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | Xbyak::Label JitShader::CompilePrelude_Log2() { | ||||||
|  |     Xbyak::Label subroutine; | ||||||
|  | 
 | ||||||
|  |     // SSE does not have a log instruction, thus we must approximate.
 | ||||||
|  |     // We perform this approximation by first performing a range reduction into the range [1.0, 2.0).
 | ||||||
|  |     // A minimax polynomial which was fit for the function log2(x) / (x - 1) is then evaluated.
 | ||||||
|  |     // We multiply the result by (x - 1) then restore the result into the appropriate range.
 | ||||||
|  | 
 | ||||||
|  |     // Coefficients for the minimax polynomial.
 | ||||||
|  |     // f(x) computes approximately log2(x) / (x - 1).
 | ||||||
|  |     // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))).
 | ||||||
|  |     align(64); | ||||||
|  |     const void* c0 = getCurr(); | ||||||
|  |     dd(0x3d74552f); | ||||||
|  |     const void* c1 = getCurr(); | ||||||
|  |     dd(0xbeee7397); | ||||||
|  |     const void* c2 = getCurr(); | ||||||
|  |     dd(0x3fbd96dd); | ||||||
|  |     const void* c3 = getCurr(); | ||||||
|  |     dd(0xc02153f6); | ||||||
|  |     const void* c4 = getCurr(); | ||||||
|  |     dd(0x4038d96c); | ||||||
|  | 
 | ||||||
|  |     align(16); | ||||||
|  |     const void* negative_infinity_vector = getCurr(); | ||||||
|  |     dd(0xff800000); | ||||||
|  |     dd(0xff800000); | ||||||
|  |     dd(0xff800000); | ||||||
|  |     dd(0xff800000); | ||||||
|  |     const void* default_qnan_vector = getCurr(); | ||||||
|  |     dd(0x7fc00000); | ||||||
|  |     dd(0x7fc00000); | ||||||
|  |     dd(0x7fc00000); | ||||||
|  |     dd(0x7fc00000); | ||||||
|  | 
 | ||||||
|  |     Xbyak::Label input_is_nan, input_is_zero, input_out_of_range; | ||||||
|  | 
 | ||||||
|  |     align(16); | ||||||
|  |     L(input_out_of_range); | ||||||
|  |     je(input_is_zero); | ||||||
|  |     movaps(SRC1, xword[rip + default_qnan_vector]); | ||||||
|  |     ret(); | ||||||
|  |     L(input_is_zero); | ||||||
|  |     movaps(SRC1, xword[rip + negative_infinity_vector]); | ||||||
|  |     ret(); | ||||||
|  | 
 | ||||||
|  |     align(16); | ||||||
|  |     L(subroutine); | ||||||
|  | 
 | ||||||
|  |     // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
 | ||||||
|  |     xorps(SCRATCH, SCRATCH); | ||||||
|  |     ucomiss(SCRATCH, SRC1); | ||||||
|  |     jp(input_is_nan); | ||||||
|  |     jae(input_out_of_range); | ||||||
|  | 
 | ||||||
|  |     // Split input
 | ||||||
|  |     movd(eax, SRC1); | ||||||
|  |     mov(edx, eax); | ||||||
|  |     and_(eax, 0x7f800000); | ||||||
|  |     and_(edx, 0x007fffff); | ||||||
|  |     movss(SCRATCH, xword[rip + c0]); // Preload c0.
 | ||||||
|  |     or_(edx, 0x3f800000); | ||||||
|  |     movd(SRC1, edx); | ||||||
|  |     // SRC1 now contains the mantissa of the input.
 | ||||||
|  |     mulss(SCRATCH, SRC1); | ||||||
|  |     shr(eax, 23); | ||||||
|  |     sub(eax, 0x7f); | ||||||
|  |     cvtsi2ss(SCRATCH2, eax); | ||||||
|  |     // SCRATCH2 now contains the exponent of the input.
 | ||||||
|  | 
 | ||||||
|  |     // Complete computation of polynomial
 | ||||||
|  |     addss(SCRATCH, xword[rip + c1]); | ||||||
|  |     mulss(SCRATCH, SRC1); | ||||||
|  |     addss(SCRATCH, xword[rip + c2]); | ||||||
|  |     mulss(SCRATCH, SRC1); | ||||||
|  |     addss(SCRATCH, xword[rip + c3]); | ||||||
|  |     mulss(SCRATCH, SRC1); | ||||||
|  |     subss(SRC1, ONE); | ||||||
|  |     addss(SCRATCH, xword[rip + c4]); | ||||||
|  |     mulss(SCRATCH, SRC1); | ||||||
|  |     addss(SCRATCH2, SCRATCH); | ||||||
|  | 
 | ||||||
|  |     // Duplicate result across vector
 | ||||||
|  |     xorps(SRC1, SRC1); // break dependency chain
 | ||||||
|  |     movss(SRC1, SCRATCH2); | ||||||
|  |     L(input_is_nan); | ||||||
|  |     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||||||
|  | 
 | ||||||
|  |     ret(); | ||||||
|  | 
 | ||||||
|  |     return subroutine; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | Xbyak::Label JitShader::CompilePrelude_Exp2() { | ||||||
|  |     Xbyak::Label subroutine; | ||||||
|  | 
 | ||||||
|  |     // SSE does not have an exp instruction, thus we must approximate.
 | ||||||
|  |     // We perform this approximation by first performing a range reduction into the range [-0.5, 0.5).
 | ||||||
|  |     // A minimax polynomial which was fit for the function exp2(x) is then evaluated.
 | ||||||
|  |     // We then restore the result into the appropriate range.
 | ||||||
|  | 
 | ||||||
|  |     align(64); | ||||||
|  |     const void* input_max = getCurr(); | ||||||
|  |     dd(0x43010000); | ||||||
|  |     const void* input_min = getCurr(); | ||||||
|  |     dd(0xc2fdffff); | ||||||
|  |     const void* c0 = getCurr(); | ||||||
|  |     dd(0x3c5dbe69); | ||||||
|  |     const void* half = getCurr(); | ||||||
|  |     dd(0x3f000000); | ||||||
|  |     const void* c1 = getCurr(); | ||||||
|  |     dd(0x3d5509f9); | ||||||
|  |     const void* c2 = getCurr(); | ||||||
|  |     dd(0x3e773cc5); | ||||||
|  |     const void* c3 = getCurr(); | ||||||
|  |     dd(0x3f3168b3); | ||||||
|  |     const void* c4 = getCurr(); | ||||||
|  |     dd(0x3f800016); | ||||||
|  | 
 | ||||||
|  |     Xbyak::Label ret_label; | ||||||
|  | 
 | ||||||
|  |     align(16); | ||||||
|  |     L(subroutine); | ||||||
|  | 
 | ||||||
|  |     // Handle edge cases
 | ||||||
|  |     ucomiss(SRC1, SRC1); | ||||||
|  |     jp(ret_label); | ||||||
|  |     // Clamp to maximum range since we shift the value directly into the exponent.
 | ||||||
|  |     minss(SRC1, xword[rip + input_max]); | ||||||
|  |     maxss(SRC1, xword[rip + input_min]); | ||||||
|  | 
 | ||||||
|  |     // Decompose input
 | ||||||
|  |     movss(SCRATCH, SRC1); | ||||||
|  |     movss(SCRATCH2, xword[rip + c0]); // Preload c0.
 | ||||||
|  |     subss(SCRATCH, xword[rip + half]); | ||||||
|  |     cvtss2si(eax, SCRATCH); | ||||||
|  |     cvtsi2ss(SCRATCH, eax); | ||||||
|  |     // SCRATCH now contains input rounded to the nearest integer.
 | ||||||
|  |     add(eax, 0x7f); | ||||||
|  |     subss(SRC1, SCRATCH); | ||||||
|  |     // SRC1 contains input - round(input), which is in [-0.5, 0.5).
 | ||||||
|  |     mulss(SCRATCH2, SRC1); | ||||||
|  |     shl(eax, 23); | ||||||
|  |     movd(SCRATCH, eax); | ||||||
|  |     // SCRATCH contains 2^(round(input)).
 | ||||||
|  | 
 | ||||||
|  |     // Complete computation of polynomial.
 | ||||||
|  |     addss(SCRATCH2, xword[rip + c1]); | ||||||
|  |     mulss(SCRATCH2, SRC1); | ||||||
|  |     addss(SCRATCH2, xword[rip + c2]); | ||||||
|  |     mulss(SCRATCH2, SRC1); | ||||||
|  |     addss(SCRATCH2, xword[rip + c3]); | ||||||
|  |     mulss(SRC1, SCRATCH2); | ||||||
|  |     addss(SRC1, xword[rip + c4]); | ||||||
|  |     mulss(SRC1, SCRATCH); | ||||||
|  | 
 | ||||||
|  |     // Duplicate result across vector
 | ||||||
|  |     L(ret_label); | ||||||
|  |     shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||||||
|  | 
 | ||||||
|  |     ret(); | ||||||
|  | 
 | ||||||
|  |     return subroutine; | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
| } // namespace Shader
 | } // namespace Shader
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -106,6 +106,13 @@ private: | ||||||
|      */ |      */ | ||||||
|     void FindReturnOffsets(); |     void FindReturnOffsets(); | ||||||
| 
 | 
 | ||||||
|  |     /**
 | ||||||
|  |      * Emits data and code for utility functions. | ||||||
|  |      */ | ||||||
|  |     void CompilePrelude(); | ||||||
|  |     Xbyak::Label CompilePrelude_Log2(); | ||||||
|  |     Xbyak::Label CompilePrelude_Exp2(); | ||||||
|  | 
 | ||||||
|     const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; |     const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; | ||||||
|     const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; |     const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; | ||||||
| 
 | 
 | ||||||
|  | @ -120,6 +127,9 @@ private: | ||||||
| 
 | 
 | ||||||
|     using CompiledShader = void(const void* setup, void* state, const u8* start_addr); |     using CompiledShader = void(const void* setup, void* state, const u8* start_addr); | ||||||
|     CompiledShader* program = nullptr; |     CompiledShader* program = nullptr; | ||||||
|  | 
 | ||||||
|  |     Xbyak::Label log2_subroutine; | ||||||
|  |     Xbyak::Label exp2_subroutine; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| } // Shader
 | } // Shader
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue