mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 05:40:04 +00:00 
			
		
		
		
	shader_jit_a64: Compact host executable memory (#230)
* common/aarch64: Allow generic code generator types

  Use the templated `BasicCodeGenerator` type rather than the specialized `CodeGenerator` type. This allows `VectorCodeGenerator` to work with these functions.

* common/aarch64: Add `VectorCodeGenerator` to `CallFarFunction`

  `VectorCodeGenerator` will always do far-calls since we cannot resolve any absolute addresses here.

* shader_jit_a64: Implement position-independent VectorCodeGenerator

  Generates more position-independent assembly so that code can be generated within a resizable vector before being copied into executable memory. This allows for more compact memory allocation and usage, rather than a statically defined worst-case allocation for all cases. `VectorCodeGenerator` needs to generate position-independent code rather than use absolute addresses. All far function calls made through `VectorCodeGenerator` are assumed to use absolute addresses rather than potentially using a relative `BL` branch after memory relocation.
This commit is contained in:
		
							parent
							
								
									82faf2e557
								
							
						
					
					
						commit
						3e5bbac5a1
					
				
					 4 changed files with 74 additions and 41 deletions
				
			
		|  | @ -78,7 +78,8 @@ inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<64> regs, std::size_t fra | |||
|     return ABIFrameInfo{static_cast<u32>(total_size), static_cast<u32>(fprs_base_subtraction)}; | ||||
| } | ||||
| 
 | ||||
| inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, | ||||
| template <typename Policy> | ||||
| inline void ABI_PushRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs, | ||||
|                               std::size_t frame_size = 0) { | ||||
|     using namespace oaknut; | ||||
|     using namespace oaknut::util; | ||||
|  | @ -137,7 +138,8 @@ inline void ABI_PushRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, | |||
|     } | ||||
| } | ||||
| 
 | ||||
| inline void ABI_PopRegisters(oaknut::CodeGenerator& code, std::bitset<64> regs, | ||||
| template <typename Policy> | ||||
| inline void ABI_PopRegisters(oaknut::BasicCodeGenerator<Policy>& code, std::bitset<64> regs, | ||||
|                              std::size_t frame_size = 0) { | ||||
|     using namespace oaknut; | ||||
|     using namespace oaknut::util; | ||||
|  |  | |||
|  | @ -38,6 +38,16 @@ inline void CallFarFunction(oaknut::CodeGenerator& code, const T f) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| inline void CallFarFunction(oaknut::VectorCodeGenerator& code, const T f) { | ||||
|     static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer."); | ||||
|     // X16(IP0) and X17(IP1) are the standard veneer registers
 | ||||
|     // LR is also available as an intermediate register
 | ||||
|     // https://developer.arm.com/documentation/102374/0101/Procedure-Call-Standard
 | ||||
|     code.MOVP2R(oaknut::util::X16, reinterpret_cast<const void*>(f)); | ||||
|     code.BLR(oaknut::util::X16); | ||||
| } | ||||
| 
 | ||||
| } // namespace Common::A64
 | ||||
| 
 | ||||
| #endif // CITRA_ARCH(arm64)
 | ||||
|  |  | |||
|  | @ -942,7 +942,7 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_ | |||
|     swizzle_data = swizzle_data_; | ||||
| 
 | ||||
|     // Reset flow control state
 | ||||
|     program = xptr<CompiledShader*>(); | ||||
|     const std::uintptr_t program_offset = offset(); | ||||
|     program_counter = 0; | ||||
|     loop_depth = 0; | ||||
|     instruction_labels.fill(Label()); | ||||
|  | @ -984,18 +984,28 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_ | |||
|     return_offsets.clear(); | ||||
|     return_offsets.shrink_to_fit(); | ||||
| 
 | ||||
|     // Copy to executable memory
 | ||||
|     const size_t code_size = code_vec.size() * sizeof(u32); | ||||
| 
 | ||||
|     code_mem = std::make_unique<oaknut::CodeBlock>(code_size); | ||||
|     code_mem->unprotect(); | ||||
| 
 | ||||
|     program = reinterpret_cast<CompiledShader*>(reinterpret_cast<std::byte*>(code_mem->ptr()) + | ||||
|                                                 program_offset); | ||||
| 
 | ||||
|     // Copy to executable memory
 | ||||
|     std::memcpy(code_mem->ptr(), code_vec.data(), code_vec.size() * sizeof(u32)); | ||||
| 
 | ||||
|     // Memory is ready to execute
 | ||||
|     protect(); | ||||
|     invalidate_all(); | ||||
|     code_mem->protect(); | ||||
|     code_mem->invalidate_all(); | ||||
| 
 | ||||
|     const std::size_t code_size = static_cast<std::size_t>(offset()); | ||||
| 
 | ||||
|     ASSERT_MSG(code_size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||||
|     LOG_DEBUG(HW_GPU, "Compiled shader size={}", code_size); | ||||
|     // code_vec is no longer needed
 | ||||
|     code_vec.clear(); | ||||
|     code_vec.shrink_to_fit(); | ||||
| } | ||||
| 
 | ||||
| JitShader::JitShader() : CodeBlock(MAX_SHADER_SIZE), CodeGenerator(CodeBlock::ptr()) { | ||||
|     unprotect(); | ||||
| JitShader::JitShader() : oaknut::VectorCodeGenerator(code_vec) { | ||||
|     CompilePrelude(); | ||||
| } | ||||
| 
 | ||||
|  | @ -1013,19 +1023,22 @@ Label JitShader::CompilePrelude_Log2() { | |||
|     // range. Coefficients for the minimax polynomial.
 | ||||
|     // f(x) computes approximately log2(x) / (x - 1).
 | ||||
|     // f(x) = c4 + x * (c3 + x * (c2 + x * (c1 + x * c0))).
 | ||||
|     align(16); | ||||
|     const void* c0 = xptr<const void*>(); | ||||
|     oaknut::Label c0; | ||||
|     // align(16);
 | ||||
|     l(c0); | ||||
|     dw(0x3d74552f); | ||||
| 
 | ||||
|     align(16); | ||||
|     const void* c14 = xptr<const void*>(); | ||||
|     // align(16);
 | ||||
|     oaknut::Label c14; | ||||
|     l(c14); | ||||
|     dw(0xbeee7397); | ||||
|     dw(0x3fbd96dd); | ||||
|     dw(0xc02153f6); | ||||
|     dw(0x4038d96c); | ||||
| 
 | ||||
|     align(16); | ||||
|     const void* negative_infinity_vector = xptr<const void*>(); | ||||
|     // align(16);
 | ||||
|     oaknut::Label negative_infinity_vector; | ||||
|     l(negative_infinity_vector); | ||||
|     dw(0xff800000); | ||||
|     dw(0xff800000); | ||||
|     dw(0xff800000); | ||||
|  | @ -1038,19 +1051,19 @@ Label JitShader::CompilePrelude_Log2() { | |||
| 
 | ||||
|     Label input_is_nan, input_is_zero, input_out_of_range; | ||||
| 
 | ||||
|     align(16); | ||||
|     // align(16);
 | ||||
|     l(input_out_of_range); | ||||
|     B(Cond::EQ, input_is_zero); | ||||
|     MOVP2R(XSCRATCH0, default_qnan_vector); | ||||
|     ADR(XSCRATCH0, default_qnan_vector); | ||||
|     LDR(SRC1, XSCRATCH0); | ||||
|     RET(); | ||||
| 
 | ||||
|     l(input_is_zero); | ||||
|     MOVP2R(XSCRATCH0, negative_infinity_vector); | ||||
|     ADR(XSCRATCH0, negative_infinity_vector); | ||||
|     LDR(SRC1, XSCRATCH0); | ||||
|     RET(); | ||||
| 
 | ||||
|     align(16); | ||||
|     // align(16);
 | ||||
|     l(subroutine); | ||||
| 
 | ||||
|     // Here we handle edge cases: input in {NaN, 0, -Inf, Negative}.
 | ||||
|  | @ -1078,14 +1091,14 @@ Label JitShader::CompilePrelude_Log2() { | |||
|     UCVTF(VSCRATCH1.toS(), VSCRATCH1.toS()); | ||||
|     // VSCRATCH1 now contains the exponent of the input.
 | ||||
| 
 | ||||
|     MOVP2R(XSCRATCH0, c0); | ||||
|     ADR(XSCRATCH0, c0); | ||||
|     LDR(XSCRATCH0.toW(), XSCRATCH0); | ||||
|     MOV(VSCRATCH0.Selem()[0], XSCRATCH0.toW()); | ||||
| 
 | ||||
|     // Complete computation of polynomial
 | ||||
|     // Load C1,C2,C3,C4 into a single scratch register
 | ||||
|     const QReg C14 = SRC2; | ||||
|     MOVP2R(XSCRATCH0, c14); | ||||
|     ADR(XSCRATCH0, c14); | ||||
|     LDR(C14, XSCRATCH0); | ||||
|     FMUL(VSCRATCH0.toS(), VSCRATCH0.toS(), SRC1.toS()); | ||||
|     FMLA(VSCRATCH0.toS(), ONE.toS(), C14.Selem()[0]); | ||||
|  | @ -1118,27 +1131,35 @@ Label JitShader::CompilePrelude_Exp2() { | |||
|     // polynomial which was fit for the function exp2(x) is then evaluated. We then restore the
 | ||||
|     // result into the appropriate range.
 | ||||
| 
 | ||||
|     align(16); | ||||
|     const void* input_max = xptr<const void*>(); | ||||
|     // align(16);
 | ||||
|     Label input_max; | ||||
|     l(input_max); | ||||
|     dw(0x43010000); | ||||
|     const void* input_min = xptr<const void*>(); | ||||
|     Label input_min; | ||||
|     l(input_min); | ||||
|     dw(0xc2fdffff); | ||||
|     const void* c0 = xptr<const void*>(); | ||||
|     Label c0; | ||||
|     l(c0); | ||||
|     dw(0x3c5dbe69); | ||||
|     const void* half = xptr<const void*>(); | ||||
|     Label half; | ||||
|     l(half); | ||||
|     dw(0x3f000000); | ||||
|     const void* c1 = xptr<const void*>(); | ||||
|     Label c1; | ||||
|     l(c1); | ||||
|     dw(0x3d5509f9); | ||||
|     const void* c2 = xptr<const void*>(); | ||||
|     Label c2; | ||||
|     l(c2); | ||||
|     dw(0x3e773cc5); | ||||
|     const void* c3 = xptr<const void*>(); | ||||
|     Label c3; | ||||
|     l(c3); | ||||
|     dw(0x3f3168b3); | ||||
|     const void* c4 = xptr<const void*>(); | ||||
|     Label c4; | ||||
|     l(c4); | ||||
|     dw(0x3f800016); | ||||
| 
 | ||||
|     Label ret_label; | ||||
| 
 | ||||
|     align(16); | ||||
|     // align(16);
 | ||||
|     l(subroutine); | ||||
| 
 | ||||
|     // Handle edge cases
 | ||||
|  | @ -1149,15 +1170,15 @@ Label JitShader::CompilePrelude_Exp2() { | |||
|     // VSCRATCH0=2^round(input)
 | ||||
|     // SRC1=input-round(input) [-0.5, 0.5)
 | ||||
|     // Clamp to maximum range since we shift the value directly into the exponent.
 | ||||
|     MOVP2R(XSCRATCH0, input_max); | ||||
|     ADR(XSCRATCH0, input_max); | ||||
|     LDR(VSCRATCH0.toS(), XSCRATCH0); | ||||
|     FMIN(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); | ||||
| 
 | ||||
|     MOVP2R(XSCRATCH0, input_min); | ||||
|     ADR(XSCRATCH0, input_min); | ||||
|     LDR(VSCRATCH0.toS(), XSCRATCH0); | ||||
|     FMAX(SRC1.toS(), SRC1.toS(), VSCRATCH0.toS()); | ||||
| 
 | ||||
|     MOVP2R(XSCRATCH0, half); | ||||
|     ADR(XSCRATCH0, half); | ||||
|     LDR(VSCRATCH0.toS(), XSCRATCH0); | ||||
|     FSUB(VSCRATCH0.toS(), SRC1.toS(), VSCRATCH0.toS()); | ||||
| 
 | ||||
|  |  | |||
|  | @ -30,20 +30,17 @@ struct ShaderUnit; | |||
| 
 | ||||
| namespace Pica::Shader { | ||||
| 
 | ||||
| /// Memory allocated for each compiled shader
 | ||||
| constexpr std::size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 256; | ||||
| 
 | ||||
| /**
 | ||||
|  * This class implements the shader JIT compiler. It recompiles a Pica shader program into AArch64 | ||||
|  * code that can be executed on the host machine directly. | ||||
|  */ | ||||
| class JitShader : private oaknut::CodeBlock, private oaknut::CodeGenerator { | ||||
| class JitShader : public oaknut::VectorCodeGenerator { | ||||
| public: | ||||
|     JitShader(); | ||||
| 
 | ||||
|     void Run(const ShaderSetup& setup, ShaderUnit& state, u32 offset) const { | ||||
|         program(&setup.uniforms, &state, | ||||
|                 reinterpret_cast<std::byte*>(oaknut::CodeBlock::ptr()) + | ||||
|                 reinterpret_cast<const std::byte*>(code_mem->ptr()) + | ||||
|                     instruction_labels[offset].offset()); | ||||
|     } | ||||
| 
 | ||||
|  | @ -81,6 +78,9 @@ public: | |||
|     void Compile_SETE(Instruction instr); | ||||
| 
 | ||||
| private: | ||||
|     std::vector<u32> code_vec; | ||||
|     std::unique_ptr<oaknut::CodeBlock> code_mem; | ||||
| 
 | ||||
|     void Compile_Block(u32 end); | ||||
|     void Compile_NextInstr(); | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue