mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 05:40:04 +00:00 
			
		
		
		
	Merge pull request #1088 from aroulin/x64-emitter-abi-call
x64: Proper stack alignment in shader JIT function calls
This commit is contained in:
		
						commit
						918ca40c68
					
				
					 7 changed files with 301 additions and 455 deletions
				
			
		|  | @ -24,6 +24,7 @@ set(SRCS | |||
| set(HEADERS | ||||
|             assert.h | ||||
|             bit_field.h | ||||
|             bit_set.h | ||||
|             break_points.h | ||||
|             chunk_file.h | ||||
|             code_block.h | ||||
|  |  | |||
							
								
								
									
										189
									
								
								src/common/bit_set.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										189
									
								
								src/common/bit_set.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,189 @@ | |||
| // This file is under the public domain.
 | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <cstddef> | ||||
| #ifdef _WIN32 | ||||
| #include <intrin.h> | ||||
| #endif | ||||
| #include <initializer_list> | ||||
| #include <type_traits> | ||||
| #include "common/common_types.h" | ||||
| 
 | ||||
| // namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
 | ||||
| namespace Common { | ||||
| 
 | ||||
| // Helper functions:
 | ||||
| 
 | ||||
| #ifdef _WIN32 | ||||
| template <typename T> | ||||
| static inline int CountSetBits(T v) | ||||
| { | ||||
|     // from https://graphics.stanford.edu/~seander/bithacks.html
 | ||||
|     // GCC has this built in, but MSVC's intrinsic will only emit the actual
 | ||||
|     // POPCNT instruction, which we're not depending on
 | ||||
|     v = v - ((v >> 1) & (T)~(T)0/3); | ||||
|     v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); | ||||
|     v = (v + (v >> 4)) & (T)~(T)0/255*15; | ||||
|     return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; | ||||
| } | ||||
| static inline int LeastSignificantSetBit(u8 val) | ||||
| { | ||||
|     unsigned long index; | ||||
|     _BitScanForward(&index, val); | ||||
|     return (int)index; | ||||
| } | ||||
| static inline int LeastSignificantSetBit(u16 val) | ||||
| { | ||||
|     unsigned long index; | ||||
|     _BitScanForward(&index, val); | ||||
|     return (int)index; | ||||
| } | ||||
| static inline int LeastSignificantSetBit(u32 val) | ||||
| { | ||||
|     unsigned long index; | ||||
|     _BitScanForward(&index, val); | ||||
|     return (int)index; | ||||
| } | ||||
| static inline int LeastSignificantSetBit(u64 val) | ||||
| { | ||||
|     unsigned long index; | ||||
|     _BitScanForward64(&index, val); | ||||
|     return (int)index; | ||||
| } | ||||
| #else | ||||
| static inline int CountSetBits(u8 val) { return __builtin_popcount(val); } | ||||
| static inline int CountSetBits(u16 val) { return __builtin_popcount(val); } | ||||
| static inline int CountSetBits(u32 val) { return __builtin_popcount(val); } | ||||
| static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); } | ||||
| static inline int LeastSignificantSetBit(u8 val) { return __builtin_ctz(val); } | ||||
| static inline int LeastSignificantSetBit(u16 val) { return __builtin_ctz(val); } | ||||
| static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); } | ||||
| static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); } | ||||
| #endif | ||||
| 
 | ||||
| // Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
 | ||||
| // using the set bits of an integer to represent a set of integers.  Like that
 | ||||
| // class, it acts like an array of bools:
 | ||||
| //     BitSet32 bs;
 | ||||
| //     bs[1] = true;
 | ||||
| // but also like the underlying integer ([0] = least significant bit):
 | ||||
| //     BitSet32 bs2 = ...;
 | ||||
| //     bs = (bs ^ bs2) & BitSet32(0xffff);
 | ||||
| // The following additional functionality is provided:
 | ||||
| // - Construction using an initializer list.
 | ||||
| //     BitSet bs { 1, 2, 4, 8 };
 | ||||
| // - Efficiently iterating through the set bits:
 | ||||
| //     for (int i : bs)
 | ||||
| //         [i is the *index* of a set bit]
 | ||||
| //   (This uses the appropriate CPU instruction to find the next set bit in one
 | ||||
| //   operation.)
 | ||||
| // - Counting set bits using .Count() - see comment on that method.
 | ||||
| 
 | ||||
| // TODO: use constexpr when MSVC gets out of the Dark Ages
 | ||||
| 
 | ||||
| template <typename IntTy> | ||||
| class BitSet | ||||
| { | ||||
|     static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types"); | ||||
| public: | ||||
|     // A reference to a particular bit, returned from operator[].
 | ||||
|     class Ref | ||||
|     { | ||||
|     public: | ||||
|         Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} | ||||
|         Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} | ||||
|         operator bool() const { return (m_bs->m_val & m_mask) != 0; } | ||||
|         bool operator=(bool set) | ||||
|         { | ||||
|             m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); | ||||
|             return set; | ||||
|         } | ||||
|     private: | ||||
|         BitSet* m_bs; | ||||
|         IntTy m_mask; | ||||
|     }; | ||||
| 
 | ||||
|     // A STL-like iterator is required to be able to use range-based for loops.
 | ||||
|     class Iterator | ||||
|     { | ||||
|     public: | ||||
|         Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} | ||||
|         Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} | ||||
|         Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; } | ||||
|         int operator*() { return m_bit; } | ||||
|         Iterator& operator++() | ||||
|         { | ||||
|             if (m_val == 0) | ||||
|             { | ||||
|                 m_bit = -1; | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 int bit = LeastSignificantSetBit(m_val); | ||||
|                 m_val &= ~(1 << bit); | ||||
|                 m_bit = bit; | ||||
|             } | ||||
|             return *this; | ||||
|         } | ||||
|         Iterator operator++(int _) | ||||
|         { | ||||
|             Iterator other(*this); | ||||
|             ++*this; | ||||
|             return other; | ||||
|         } | ||||
|         bool operator==(Iterator other) const { return m_bit == other.m_bit; } | ||||
|         bool operator!=(Iterator other) const { return m_bit != other.m_bit; } | ||||
|     private: | ||||
|         IntTy m_val; | ||||
|         int m_bit; | ||||
|     }; | ||||
| 
 | ||||
|     BitSet() : m_val(0) {} | ||||
|     explicit BitSet(IntTy val) : m_val(val) {} | ||||
|     BitSet(std::initializer_list<int> init) | ||||
|     { | ||||
|         m_val = 0; | ||||
|         for (int bit : init) | ||||
|             m_val |= (IntTy)1 << bit; | ||||
|     } | ||||
| 
 | ||||
|     static BitSet AllTrue(size_t count) | ||||
|     { | ||||
|         return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1)); | ||||
|     } | ||||
| 
 | ||||
|     Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } | ||||
|     const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; } | ||||
|     bool operator==(BitSet other) const { return m_val == other.m_val; } | ||||
|     bool operator!=(BitSet other) const { return m_val != other.m_val; } | ||||
|     bool operator<(BitSet other) const { return m_val < other.m_val; } | ||||
|     bool operator>(BitSet other) const { return m_val > other.m_val; } | ||||
|     BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); } | ||||
|     BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } | ||||
|     BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } | ||||
|     BitSet operator~() const { return BitSet(~m_val); } | ||||
|     BitSet& operator|=(BitSet other) { return *this = *this | other; } | ||||
|     BitSet& operator&=(BitSet other) { return *this = *this & other; } | ||||
|     BitSet& operator^=(BitSet other) { return *this = *this ^ other; } | ||||
|     operator u32() = delete; | ||||
|     operator bool() { return m_val != 0; } | ||||
| 
 | ||||
|     // Warning: Even though on modern CPUs this is a single fast instruction,
 | ||||
|     // Dolphin's official builds do not currently assume POPCNT support on x86,
 | ||||
|     // so slower explicit bit twiddling is generated.  Still should generally
 | ||||
|     // be faster than a loop.
 | ||||
|     unsigned int Count() const { return CountSetBits(m_val); } | ||||
| 
 | ||||
|     Iterator begin() const { Iterator it(m_val, 0); return ++it; } | ||||
|     Iterator end() const { return Iterator(m_val, -1); } | ||||
| 
 | ||||
|     IntTy m_val; | ||||
| }; | ||||
| 
 | ||||
| } // Common
 | ||||
| 
 | ||||
| typedef Common::BitSet<u8> BitSet8; | ||||
| typedef Common::BitSet<u16> BitSet16; | ||||
| typedef Common::BitSet<u32> BitSet32; | ||||
| typedef Common::BitSet<u64> BitSet64; | ||||
|  | @ -22,247 +22,69 @@ using namespace Gen; | |||
| 
 | ||||
| // Shared code between Win64 and Unix64
 | ||||
| 
 | ||||
| // Sets up a __cdecl function.
 | ||||
| void XEmitter::ABI_EmitPrologue(int maxCallParams) | ||||
| { | ||||
| #ifdef _M_IX86 | ||||
|     // Don't really need to do anything
 | ||||
| #elif defined(ARCHITECTURE_x86_64) | ||||
| #if _WIN32 | ||||
|     int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; | ||||
|     // Set up a stack frame so that we can call functions
 | ||||
|     // TODO: use maxCallParams
 | ||||
|     SUB(64, R(RSP), Imm8(stacksize)); | ||||
| #endif | ||||
| #else | ||||
| #error Arch not supported | ||||
| void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) { | ||||
|     size_t shadow = 0; | ||||
| #if defined(_WIN32) | ||||
|     shadow = 0x20; | ||||
| #endif | ||||
| 
 | ||||
|     int count = (mask & ABI_ALL_GPRS).Count(); | ||||
|     rsp_alignment -= count * 8; | ||||
|     size_t subtraction = 0; | ||||
|     int fpr_count = (mask & ABI_ALL_FPRS).Count(); | ||||
|     if (fpr_count) { | ||||
|         // If we have any XMMs to save, we must align the stack here.
 | ||||
|         subtraction = rsp_alignment & 0xf; | ||||
|     } | ||||
|     subtraction += 16 * fpr_count; | ||||
|     size_t xmm_base_subtraction = subtraction; | ||||
|     subtraction += needed_frame_size; | ||||
|     subtraction += shadow; | ||||
|     // Final alignment.
 | ||||
|     rsp_alignment -= subtraction; | ||||
|     subtraction += rsp_alignment & 0xf; | ||||
| 
 | ||||
|     *shadowp = shadow; | ||||
|     *subtractionp = subtraction; | ||||
|     *xmm_offsetp = subtraction - xmm_base_subtraction; | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_EmitEpilogue(int maxCallParams) | ||||
| { | ||||
| #ifdef _M_IX86 | ||||
|     RET(); | ||||
| #elif defined(ARCHITECTURE_x86_64) | ||||
| #ifdef _WIN32 | ||||
|     int stacksize = ((maxCallParams+1)&~1)*8 + 8; | ||||
|     ADD(64, R(RSP), Imm8(stacksize)); | ||||
| #endif | ||||
|     RET(); | ||||
| #else | ||||
| #error Arch not supported | ||||
| size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { | ||||
|     size_t shadow, subtraction, xmm_offset; | ||||
|     ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); | ||||
| 
 | ||||
|     for (int r : mask & ABI_ALL_GPRS) | ||||
|         PUSH((X64Reg)r); | ||||
| 
 | ||||
| #endif | ||||
|     if (subtraction) | ||||
|         SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); | ||||
| 
 | ||||
|     for (int x : mask & ABI_ALL_FPRS) { | ||||
|         MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16)); | ||||
|         xmm_offset += 16; | ||||
|     } | ||||
| 
 | ||||
|     return shadow; | ||||
| } | ||||
| 
 | ||||
| #ifdef _M_IX86 // All32
 | ||||
| void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) { | ||||
|     size_t shadow, subtraction, xmm_offset; | ||||
|     ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset); | ||||
| 
 | ||||
| // Shared code between Win32 and Unix32
 | ||||
| void XEmitter::ABI_CallFunction(const void *func) { | ||||
|     ABI_AlignStack(0); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(0); | ||||
| } | ||||
|     for (int x : mask & ABI_ALL_FPRS) { | ||||
|         MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset)); | ||||
|         xmm_offset += 16; | ||||
|     } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { | ||||
|     ABI_AlignStack(1 * 2); | ||||
|     PUSH(16, Imm16(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 2); | ||||
| } | ||||
|     if (subtraction) | ||||
|         ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { | ||||
|     ABI_AlignStack(1 * 2 + 1 * 4); | ||||
|     PUSH(16, Imm16(param2)); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 2 + 1 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { | ||||
|     ABI_AlignStack(1 * 4); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { | ||||
|     ABI_AlignStack(2 * 4); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(2 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { | ||||
|     ABI_AlignStack(3 * 4); | ||||
|     PUSH(32, Imm32(param3)); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(3 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { | ||||
|     ABI_AlignStack(3 * 4); | ||||
|     PUSH(32, ImmPtr(param3)); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(3 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { | ||||
|     ABI_AlignStack(4 * 4); | ||||
|     PUSH(32, ImmPtr(param4)); | ||||
|     PUSH(32, Imm32(param3)); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, Imm32(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(4 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { | ||||
|     ABI_AlignStack(1 * 4); | ||||
|     PUSH(32, ImmPtr(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { | ||||
|     ABI_AlignStack(2 * 4); | ||||
|     PUSH(32, arg2); | ||||
|     PUSH(32, ImmPtr(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(2 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { | ||||
|     ABI_AlignStack(3 * 4); | ||||
|     PUSH(32, arg3); | ||||
|     PUSH(32, arg2); | ||||
|     PUSH(32, ImmPtr(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(3 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { | ||||
|     ABI_AlignStack(3 * 4); | ||||
|     PUSH(32, Imm32(param3)); | ||||
|     PUSH(32, ImmPtr(param2)); | ||||
|     PUSH(32, ImmPtr(param1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(3 * 4); | ||||
| } | ||||
| 
 | ||||
| // Pass a register as a parameter.
 | ||||
| void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { | ||||
|     ABI_AlignStack(1 * 4); | ||||
|     PUSH(32, R(reg1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 4); | ||||
| } | ||||
| 
 | ||||
| // Pass two registers as parameters.
 | ||||
| void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) | ||||
| { | ||||
|     ABI_AlignStack(2 * 4); | ||||
|     PUSH(32, R(reg2)); | ||||
|     PUSH(32, R(reg1)); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(2 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) | ||||
| { | ||||
|     ABI_AlignStack(2 * 4); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, arg1); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(2 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) | ||||
| { | ||||
|     ABI_AlignStack(3 * 4); | ||||
|     PUSH(32, Imm32(param3)); | ||||
|     PUSH(32, Imm32(param2)); | ||||
|     PUSH(32, arg1); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(3 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) | ||||
| { | ||||
|     ABI_AlignStack(1 * 4); | ||||
|     PUSH(32, arg1); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(1 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) | ||||
| { | ||||
|     ABI_AlignStack(2 * 4); | ||||
|     PUSH(32, arg2); | ||||
|     PUSH(32, arg1); | ||||
|     CALL(func); | ||||
|     ABI_RestoreStack(2 * 4); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||
|     // Note: 4 * 4 = 16 bytes, so alignment is preserved.
 | ||||
|     PUSH(EBP); | ||||
|     PUSH(EBX); | ||||
|     PUSH(ESI); | ||||
|     PUSH(EDI); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||
|     POP(EDI); | ||||
|     POP(ESI); | ||||
|     POP(EBX); | ||||
|     POP(EBP); | ||||
| } | ||||
| 
 | ||||
| unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||||
|     frameSize += 4; // reserve space for return address
 | ||||
|     unsigned int alignedSize = | ||||
| #ifdef __GNUC__ | ||||
|         (frameSize + 15) & -16; | ||||
| #else | ||||
|         (frameSize + 3) & -4; | ||||
| #endif | ||||
|     return alignedSize; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| void XEmitter::ABI_AlignStack(unsigned int frameSize) { | ||||
| // Mac OS X requires the stack to be 16-byte aligned before every call.
 | ||||
| // Linux requires the stack to be 16-byte aligned before calls that put SSE
 | ||||
| // vectors on the stack, but since we do not keep track of which calls do that,
 | ||||
| // it is effectively every call as well.
 | ||||
| // Windows binaries compiled with MSVC do not have such a restriction*, but I
 | ||||
| // expect that GCC on Windows acts the same as GCC on Linux in this respect.
 | ||||
| // It would be nice if someone could verify this.
 | ||||
| // *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times.
 | ||||
|     unsigned int fillSize = | ||||
|         ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); | ||||
|     if (fillSize != 0) { | ||||
|         SUB(32, R(ESP), Imm8(fillSize)); | ||||
|     for (int r = 15; r >= 0; r--) { | ||||
|         if (mask[r]) | ||||
|             POP((X64Reg)r); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_RestoreStack(unsigned int frameSize) { | ||||
|     unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); | ||||
|     alignedSize -= 4; // return address is POPped at end of call
 | ||||
|     if (alignedSize != 0) { | ||||
|         ADD(32, R(ESP), Imm8(alignedSize)); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #else //64bit
 | ||||
| 
 | ||||
| // Common functions
 | ||||
| void XEmitter::ABI_CallFunction(const void *func) { | ||||
|     u64 distance = u64(func) - (u64(code) + 5); | ||||
|  | @ -539,142 +361,3 @@ void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, cons | |||
|         CALL(func); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||||
|     return frameSize; | ||||
| } | ||||
| 
 | ||||
| #ifdef _WIN32 | ||||
| 
 | ||||
| // The Windows x64 ABI requires XMM6 - XMM15 to be callee saved.  10 regs.
 | ||||
| // But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs.
 | ||||
| // Let's just save all 16.
 | ||||
| const int XMM_STACK_SPACE = 16 * 16; | ||||
| 
 | ||||
| // Win64 Specific Code
 | ||||
| void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||
|     //we only want to do this once
 | ||||
|     PUSH(RBX); | ||||
|     PUSH(RSI); | ||||
|     PUSH(RDI); | ||||
|     PUSH(RBP); | ||||
|     PUSH(R12); | ||||
|     PUSH(R13); | ||||
|     PUSH(R14); | ||||
|     PUSH(R15); | ||||
|     ABI_AlignStack(0); | ||||
| 
 | ||||
|     // Do this after aligning, because before it's offset by 8.
 | ||||
|     SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||||
|     for (int i = 0; i < 16; ++i) | ||||
|         MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||
|     for (int i = 0; i < 16; ++i) | ||||
|         MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); | ||||
|     ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||||
| 
 | ||||
|     ABI_RestoreStack(0); | ||||
|     POP(R15); | ||||
|     POP(R14); | ||||
|     POP(R13); | ||||
|     POP(R12); | ||||
|     POP(RBP); | ||||
|     POP(RDI); | ||||
|     POP(RSI); | ||||
|     POP(RBX); | ||||
| } | ||||
| 
 | ||||
| // Win64 Specific Code
 | ||||
| void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||||
|     PUSH(RCX); | ||||
|     PUSH(RDX); | ||||
|     PUSH(RSI); | ||||
|     PUSH(RDI); | ||||
|     PUSH(R8); | ||||
|     PUSH(R9); | ||||
|     PUSH(R10); | ||||
|     PUSH(R11); | ||||
|     // TODO: Callers preserve XMM4-5 (XMM0-3 are args.)
 | ||||
|     ABI_AlignStack(0); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||||
|     ABI_RestoreStack(0); | ||||
|     POP(R11); | ||||
|     POP(R10); | ||||
|     POP(R9); | ||||
|     POP(R8); | ||||
|     POP(RDI); | ||||
|     POP(RSI); | ||||
|     POP(RDX); | ||||
|     POP(RCX); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||||
|     SUB(64, R(RSP), Imm8(0x28)); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||||
|     ADD(64, R(RSP), Imm8(0x28)); | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| // Unix64 Specific Code
 | ||||
| void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||
|     PUSH(RBX); | ||||
|     PUSH(RBP); | ||||
|     PUSH(R12); | ||||
|     PUSH(R13); | ||||
|     PUSH(R14); | ||||
|     PUSH(R15); | ||||
|     PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
 | ||||
|     // TODO: XMM?
 | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||
|     POP(R15); | ||||
|     POP(R15); | ||||
|     POP(R14); | ||||
|     POP(R13); | ||||
|     POP(R12); | ||||
|     POP(RBP); | ||||
|     POP(RBX); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||||
|     PUSH(RCX); | ||||
|     PUSH(RDX); | ||||
|     PUSH(RSI); | ||||
|     PUSH(RDI); | ||||
|     PUSH(R8); | ||||
|     PUSH(R9); | ||||
|     PUSH(R10); | ||||
|     PUSH(R11); | ||||
|     PUSH(R11); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||||
|     POP(R11); | ||||
|     POP(R11); | ||||
|     POP(R10); | ||||
|     POP(R9); | ||||
|     POP(R8); | ||||
|     POP(RDI); | ||||
|     POP(RSI); | ||||
|     POP(RDX); | ||||
|     POP(RCX); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||||
|     SUB(64, R(RSP), Imm8(0x08)); | ||||
| } | ||||
| 
 | ||||
| void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||||
|     ADD(64, R(RSP), Imm8(0x08)); | ||||
| } | ||||
| 
 | ||||
| #endif // WIN32
 | ||||
| 
 | ||||
| #endif // 32bit
 | ||||
|  |  | |||
|  | @ -1,35 +1,15 @@ | |||
| // Copyright (C) 2003 Dolphin Project.
 | ||||
| 
 | ||||
| // This program is free software: you can redistribute it and/or modify
 | ||||
| // it under the terms of the GNU General Public License as published by
 | ||||
| // the Free Software Foundation, version 2.0 or later versions.
 | ||||
| 
 | ||||
| // This program is distributed in the hope that it will be useful,
 | ||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | ||||
| // GNU General Public License 2.0 for more details.
 | ||||
| 
 | ||||
| // A copy of the GPL 2.0 should have been included with the program.
 | ||||
| // If not, see http://www.gnu.org/licenses/
 | ||||
| 
 | ||||
| // Official SVN repository and contact information can be found at
 | ||||
| // http://code.google.com/p/dolphin-emu/
 | ||||
| // Copyright 2008 Dolphin Emulator Project
 | ||||
| // Licensed under GPLv2+
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include "common/common_types.h" | ||||
| #include "common/bit_set.h" | ||||
| #include "emitter.h" | ||||
| 
 | ||||
| // x86/x64 ABI:s, and helpers to help follow them when JIT-ing code.
 | ||||
| // x64 ABI:s, and helpers to help follow them when JIT-ing code.
 | ||||
| // All conventions return values in EAX (+ possibly EDX).
 | ||||
| 
 | ||||
| // Linux 32-bit, Windows 32-bit (cdecl, System V):
 | ||||
| // * Caller pushes left to right
 | ||||
| // * Caller fixes stack after call
 | ||||
| // * function subtract from stack for local storage only.
 | ||||
| // Scratch:      EAX ECX EDX
 | ||||
| // Callee-save:  EBX ESI EDI EBP
 | ||||
| // Parameters:   -
 | ||||
| 
 | ||||
| // Windows 64-bit
 | ||||
| // * 4-reg "fastcall" variant, very new-skool stack handling
 | ||||
| // * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_
 | ||||
|  | @ -44,18 +24,8 @@ | |||
| // Callee-save:  RBX RBP R12 R13 R14 R15
 | ||||
| // Parameters:   RDI RSI RDX RCX R8 R9
 | ||||
| 
 | ||||
| #ifdef _M_IX86 // 32 bit calling convention, shared by all
 | ||||
| 
 | ||||
| // 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to
 | ||||
| // choose regs to put stuff in.
 | ||||
| #define ABI_PARAM1 RCX | ||||
| #define ABI_PARAM2 RDX | ||||
| 
 | ||||
| // There are no ABI_PARAM* here, since args are pushed.
 | ||||
| // 32-bit bog standard cdecl, shared between linux and windows
 | ||||
| // MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.
 | ||||
| 
 | ||||
| #elif ARCHITECTURE_x86_64 // 64 bit calling convention
 | ||||
| #define ABI_ALL_FPRS BitSet32(0xffff0000) | ||||
| #define ABI_ALL_GPRS BitSet32(0x0000ffff) | ||||
| 
 | ||||
| #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
 | ||||
| 
 | ||||
|  | @ -64,7 +34,11 @@ | |||
| #define ABI_PARAM3 R8 | ||||
| #define ABI_PARAM4 R9 | ||||
| 
 | ||||
| #else  //64-bit Unix (hopefully MacOSX too)
 | ||||
| // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
 | ||||
| #define ABI_ALL_CALLER_SAVED \ | ||||
|     (BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \ | ||||
|                 XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 }) | ||||
| #else //64-bit Unix / OS X
 | ||||
| 
 | ||||
| #define ABI_PARAM1 RDI | ||||
| #define ABI_PARAM2 RSI | ||||
|  | @ -73,6 +47,13 @@ | |||
| #define ABI_PARAM5 R8 | ||||
| #define ABI_PARAM6 R9 | ||||
| 
 | ||||
| // TODO: Avoid pushing all 16 XMM registers when possible. Most functions we call probably
 | ||||
| // don't actually clobber them.
 | ||||
| #define ABI_ALL_CALLER_SAVED \ | ||||
|     (BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \ | ||||
|      ABI_ALL_FPRS) | ||||
| #endif // WIN32
 | ||||
| 
 | ||||
| #endif // X86
 | ||||
| #define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED) | ||||
| 
 | ||||
| #define ABI_RETURN RAX | ||||
|  |  | |||
|  | @ -18,6 +18,7 @@ | |||
| #pragma once | ||||
| 
 | ||||
| #include "common/assert.h" | ||||
| #include "common/bit_set.h" | ||||
| #include "common/common_types.h" | ||||
| #include "common/code_block.h" | ||||
| 
 | ||||
|  | @ -356,7 +357,7 @@ private: | |||
|     void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); | ||||
|     void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2); | ||||
| 
 | ||||
|     void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); | ||||
|     void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); | ||||
| 
 | ||||
| protected: | ||||
|     void Write8(u8 value); | ||||
|  | @ -1007,25 +1008,26 @@ public: | |||
|         ABI_CallFunctionC((const void*)func, param1); | ||||
|     } | ||||
| 
 | ||||
|     // A function that doesn't have any control over what it will do to regs,
 | ||||
|     // such as the dispatcher, should be surrounded by these.
 | ||||
|     void ABI_PushAllCalleeSavedRegsAndAdjustStack(); | ||||
|     void ABI_PopAllCalleeSavedRegsAndAdjustStack(); | ||||
|     /**
 | ||||
|      * Saves specified registers and adjusts the stack to be 16-byte aligned as required by the ABI | ||||
|      * | ||||
|      * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs) | ||||
|      * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8 | ||||
|      * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack | ||||
|      * @return Size of the shadow space, i.e., offset of the frame | ||||
|      */ | ||||
|     size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); | ||||
| 
 | ||||
|     // A function that doesn't know anything about it's surroundings, should
 | ||||
|     // be surrounded by these to establish a safe environment, where it can roam free.
 | ||||
|     // An example is a backpatch injected function.
 | ||||
|     void ABI_PushAllCallerSavedRegsAndAdjustStack(); | ||||
|     void ABI_PopAllCallerSavedRegsAndAdjustStack(); | ||||
| 
 | ||||
|     unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); | ||||
|     void ABI_AlignStack(unsigned int frameSize); | ||||
|     void ABI_RestoreStack(unsigned int frameSize); | ||||
| 
 | ||||
|     // Sets up a __cdecl function.
 | ||||
|     // Only x64 really needs the parameter count.
 | ||||
|     void ABI_EmitPrologue(int maxCallParams); | ||||
|     void ABI_EmitEpilogue(int maxCallParams); | ||||
|     /**
 | ||||
|      * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before | ||||
|      * the matching PushRegistersAndAdjustStack. | ||||
|      * | ||||
|      * @param mask Registers to restore from the stack (high 16 bits are XMMs, low 16 bits are GPRs) | ||||
|      * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8 | ||||
|      * @param needed_frame_size Additional space that was needed | ||||
|      * @warning Stack must be currently 16-byte aligned | ||||
|      */ | ||||
|     void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); | ||||
| 
 | ||||
|     #ifdef _M_IX86 | ||||
|     static int ABI_GetNumXMMRegs() { return 8; } | ||||
|  |  | |||
|  | @ -122,6 +122,14 @@ static const X64Reg ONE = XMM14; | |||
| /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
 | ||||
| static const X64Reg NEGBIT = XMM15; | ||||
| 
 | ||||
| // State registers that must not be modified by external function calls
 | ||||
| // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
 | ||||
| static const BitSet32 persistent_regs = { | ||||
|     UNIFORMS, REGISTERS, // Pointers to register blocks
 | ||||
|     ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
 | ||||
|     ONE+16, NEGBIT+16, // Constants
 | ||||
| }; | ||||
| 
 | ||||
| /// Raw constant for the source register selector that indicates no swizzling is performed
 | ||||
| static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | ||||
| /// Raw constant for the destination register enable mask that indicates all components are enabled
 | ||||
|  | @ -295,20 +303,8 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) { | |||
|     CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_PushCallerSavedXMM() { | ||||
| #ifndef _WIN32 | ||||
|     SUB(64, R(RSP), Imm8(2 * 16)); | ||||
|     MOVUPS(MDisp(RSP, 16), ONE); | ||||
|     MOVUPS(MDisp(RSP, 0), NEGBIT); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_PopCallerSavedXMM() { | ||||
| #ifndef _WIN32 | ||||
|     MOVUPS(NEGBIT, MDisp(RSP, 0)); | ||||
|     MOVUPS(ONE, MDisp(RSP, 16)); | ||||
|     ADD(64, R(RSP), Imm8(2 * 16)); | ||||
| #endif | ||||
| BitSet32 JitCompiler::PersistentCallerSavedRegs() { | ||||
|     return persistent_regs & ABI_ALL_CALLER_SAVED; | ||||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_ADD(Instruction instr) { | ||||
|  | @ -390,12 +386,9 @@ void JitCompiler::Compile_EX2(Instruction instr) { | |||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     MOVSS(XMM0, R(SRC1)); | ||||
| 
 | ||||
|     // The following will actually break the stack alignment
 | ||||
|     ABI_PushAllCallerSavedRegsAndAdjustStack(); | ||||
|     Compile_PushCallerSavedXMM(); | ||||
|     ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | ||||
|     ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); | ||||
|     Compile_PopCallerSavedXMM(); | ||||
|     ABI_PopAllCallerSavedRegsAndAdjustStack(); | ||||
|     ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | ||||
| 
 | ||||
|     SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); | ||||
|     MOVAPS(SRC1, R(XMM0)); | ||||
|  | @ -406,12 +399,9 @@ void JitCompiler::Compile_LG2(Instruction instr) { | |||
|     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||
|     MOVSS(XMM0, R(SRC1)); | ||||
| 
 | ||||
|     // The following will actually break the stack alignment
 | ||||
|     ABI_PushAllCallerSavedRegsAndAdjustStack(); | ||||
|     Compile_PushCallerSavedXMM(); | ||||
|     ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | ||||
|     ABI_CallFunction(reinterpret_cast<const void*>(log2f)); | ||||
|     Compile_PopCallerSavedXMM(); | ||||
|     ABI_PopAllCallerSavedRegsAndAdjustStack(); | ||||
|     ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | ||||
| 
 | ||||
|     SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); | ||||
|     MOVAPS(SRC1, R(XMM0)); | ||||
|  | @ -560,7 +550,7 @@ void JitCompiler::Compile_NOP(Instruction instr) { | |||
| } | ||||
| 
 | ||||
| void JitCompiler::Compile_END(Instruction instr) { | ||||
|     ABI_PopAllCalleeSavedRegsAndAdjustStack(); | ||||
|     ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | ||||
|     RET(); | ||||
| } | ||||
| 
 | ||||
|  | @ -756,7 +746,8 @@ CompiledShader* JitCompiler::Compile() { | |||
|     const auto& code = g_state.vs.program_code; | ||||
|     unsigned offset = g_state.regs.vs.main_offset; | ||||
| 
 | ||||
|     ABI_PushAllCalleeSavedRegsAndAdjustStack(); | ||||
|     // The stack pointer is 8 modulo 16 at the entry of a procedure
 | ||||
|     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | ||||
| 
 | ||||
|     MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); | ||||
|     MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); | ||||
|  |  | |||
|  | @ -77,8 +77,7 @@ private: | |||
|     void Compile_EvaluateCondition(Instruction instr); | ||||
|     void Compile_UniformCondition(Instruction instr); | ||||
| 
 | ||||
|     void Compile_PushCallerSavedXMM(); | ||||
|     void Compile_PopCallerSavedXMM(); | ||||
|     BitSet32 PersistentCallerSavedRegs(); | ||||
| 
 | ||||
|     /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
 | ||||
|     unsigned* offset_ptr = nullptr; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue