mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 13:50:03 +00:00 
			
		
		
		
	Merge pull request #1002 from bunnei/shader-jit
Vertex Shader JIT for X86-64
This commit is contained in:
		
						commit
						d852c4ecc7
					
				
					 49 changed files with 5533 additions and 339 deletions
				
			
		|  | @ -10,9 +10,21 @@ if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks/pre-commit) | ||||||
|         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks) |         DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/.git/hooks) | ||||||
| endif() | endif() | ||||||
| 
 | 
 | ||||||
|  | # Platform-agnostic definition to check if we are on x86_64 | ||||||
|  | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "[xX]86_64" OR | ||||||
|  |    ${CMAKE_SYSTEM_PROCESSOR} MATCHES "[aA][mM][dD]64") | ||||||
|  |     set(ARCHITECTURE_x86_64 1) | ||||||
|  |     add_definitions(-DARCHITECTURE_x86_64=1) | ||||||
|  | endif() | ||||||
|  | 
 | ||||||
| if (NOT MSVC) | if (NOT MSVC) | ||||||
|     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -pthread") |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -pthread") | ||||||
|     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") |     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") | ||||||
|  | 
 | ||||||
|  |     if (ARCHITECTURE_x86_64) | ||||||
|  |         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1") | ||||||
|  |         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1") | ||||||
|  |     endif() | ||||||
| else() | else() | ||||||
|     # Silence "deprecation" warnings |     # Silence "deprecation" warnings | ||||||
|     add_definitions(/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE) |     add_definitions(/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE) | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								externals/nihstro
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								externals/nihstro
									
										
									
									
										vendored
									
									
								
							|  | @ -1 +1 @@ | ||||||
| Subproject commit 676254f71e0a7ef0aca8acce078d3c3dc80ccf70 | Subproject commit 445cba0b2ff8d348368e32698e4760a670260bfc | ||||||
|  | @ -14,7 +14,7 @@ set(HEADERS | ||||||
| create_directory_groups(${SRCS} ${HEADERS}) | create_directory_groups(${SRCS} ${HEADERS}) | ||||||
| 
 | 
 | ||||||
| add_executable(citra ${SRCS} ${HEADERS}) | add_executable(citra ${SRCS} ${HEADERS}) | ||||||
| target_link_libraries(citra core common video_core) | target_link_libraries(citra core video_core common) | ||||||
| target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) | target_link_libraries(citra ${GLFW_LIBRARIES} ${OPENGL_gl_LIBRARY} inih) | ||||||
| if (MSVC) | if (MSVC) | ||||||
|     target_link_libraries(citra getopt) |     target_link_libraries(citra getopt) | ||||||
|  |  | ||||||
|  | @ -71,6 +71,7 @@ int main(int argc, char **argv) { | ||||||
|     EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; |     EmuWindow_GLFW* emu_window = new EmuWindow_GLFW; | ||||||
| 
 | 
 | ||||||
|     VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; |     VideoCore::g_hw_renderer_enabled = Settings::values.use_hw_renderer; | ||||||
|  |     VideoCore::g_shader_jit_enabled = Settings::values.use_shader_jit; | ||||||
| 
 | 
 | ||||||
|     System::Init(emu_window); |     System::Init(emu_window); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -61,6 +61,7 @@ void Config::ReadValues() { | ||||||
| 
 | 
 | ||||||
|     // Renderer
 |     // Renderer
 | ||||||
|     Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); |     Settings::values.use_hw_renderer = glfw_config->GetBoolean("Renderer", "use_hw_renderer", false); | ||||||
|  |     Settings::values.use_shader_jit = glfw_config->GetBoolean("Renderer", "use_shader_jit", true); | ||||||
| 
 | 
 | ||||||
|     Settings::values.bg_red   = (float)glfw_config->GetReal("Renderer", "bg_red",   1.0); |     Settings::values.bg_red   = (float)glfw_config->GetReal("Renderer", "bg_red",   1.0); | ||||||
|     Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); |     Settings::values.bg_green = (float)glfw_config->GetReal("Renderer", "bg_green", 1.0); | ||||||
|  |  | ||||||
|  | @ -42,6 +42,10 @@ frame_skip = | ||||||
| # 0 (default): Software, 1: Hardware | # 0 (default): Software, 1: Hardware | ||||||
| use_hw_renderer = | use_hw_renderer = | ||||||
| 
 | 
 | ||||||
|  | # Whether to use the Just-In-Time (JIT) compiler for shader emulation | ||||||
|  | # 0 : Interpreter (slow), 1 (default): JIT (fast) | ||||||
|  | use_shader_jit = | ||||||
|  | 
 | ||||||
| # The clear color for the renderer. What shows up on the sides of the bottom screen. | # The clear color for the renderer. What shows up on the sides of the bottom screen. | ||||||
| # Must be in range of 0.0-1.0. Defaults to 1.0 for all. | # Must be in range of 0.0-1.0. Defaults to 1.0 for all. | ||||||
| bg_red = | bg_red = | ||||||
|  |  | ||||||
|  | @ -71,7 +71,7 @@ if (APPLE) | ||||||
| else() | else() | ||||||
|     add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) |     add_executable(citra-qt ${SRCS} ${HEADERS} ${UI_HDRS}) | ||||||
| endif() | endif() | ||||||
| target_link_libraries(citra-qt core common video_core qhexedit) | target_link_libraries(citra-qt core video_core common qhexedit) | ||||||
| target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) | target_link_libraries(citra-qt ${OPENGL_gl_LIBRARY} ${CITRA_QT_LIBS}) | ||||||
| target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) | target_link_libraries(citra-qt ${PLATFORM_LIBRARIES}) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -44,6 +44,7 @@ void Config::ReadValues() { | ||||||
| 
 | 
 | ||||||
|     qt_config->beginGroup("Renderer"); |     qt_config->beginGroup("Renderer"); | ||||||
|     Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); |     Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", false).toBool(); | ||||||
|  |     Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool(); | ||||||
| 
 | 
 | ||||||
|     Settings::values.bg_red   = qt_config->value("bg_red",   1.0).toFloat(); |     Settings::values.bg_red   = qt_config->value("bg_red",   1.0).toFloat(); | ||||||
|     Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); |     Settings::values.bg_green = qt_config->value("bg_green", 1.0).toFloat(); | ||||||
|  | @ -77,6 +78,7 @@ void Config::SaveValues() { | ||||||
| 
 | 
 | ||||||
|     qt_config->beginGroup("Renderer"); |     qt_config->beginGroup("Renderer"); | ||||||
|     qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); |     qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer); | ||||||
|  |     qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit); | ||||||
| 
 | 
 | ||||||
|     // Cast to double because Qt's written float values are not human-readable
 |     // Cast to double because Qt's written float values are not human-readable
 | ||||||
|     qt_config->setValue("bg_red",   (double)Settings::values.bg_red); |     qt_config->setValue("bg_red",   (double)Settings::values.bg_red); | ||||||
|  |  | ||||||
|  | @ -8,7 +8,7 @@ | ||||||
| #include <QBoxLayout> | #include <QBoxLayout> | ||||||
| #include <QTreeView> | #include <QTreeView> | ||||||
| 
 | 
 | ||||||
| #include "video_core/vertex_shader.h" | #include "video_core/shader/shader_interpreter.h" | ||||||
| 
 | 
 | ||||||
| #include "graphics_vertex_shader.h" | #include "graphics_vertex_shader.h" | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -131,6 +131,9 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) | ||||||
|     ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); |     ui.action_Use_Hardware_Renderer->setChecked(Settings::values.use_hw_renderer); | ||||||
|     SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); |     SetHardwareRendererEnabled(ui.action_Use_Hardware_Renderer->isChecked()); | ||||||
| 
 | 
 | ||||||
|  |     ui.action_Use_Shader_JIT->setChecked(Settings::values.use_shader_jit); | ||||||
|  |     SetShaderJITEnabled(ui.action_Use_Shader_JIT->isChecked()); | ||||||
|  | 
 | ||||||
|     ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); |     ui.action_Single_Window_Mode->setChecked(settings.value("singleWindowMode", true).toBool()); | ||||||
|     ToggleWindowMode(); |     ToggleWindowMode(); | ||||||
| 
 | 
 | ||||||
|  | @ -144,6 +147,7 @@ GMainWindow::GMainWindow() : emu_thread(nullptr) | ||||||
|     connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); |     connect(ui.action_Pause, SIGNAL(triggered()), this, SLOT(OnPauseGame())); | ||||||
|     connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); |     connect(ui.action_Stop, SIGNAL(triggered()), this, SLOT(OnStopGame())); | ||||||
|     connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); |     connect(ui.action_Use_Hardware_Renderer, SIGNAL(triggered(bool)), this, SLOT(SetHardwareRendererEnabled(bool))); | ||||||
|  |     connect(ui.action_Use_Shader_JIT, SIGNAL(triggered(bool)), this, SLOT(SetShaderJITEnabled(bool))); | ||||||
|     connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); |     connect(ui.action_Single_Window_Mode, SIGNAL(triggered(bool)), this, SLOT(ToggleWindowMode())); | ||||||
|     connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); |     connect(ui.action_Hotkeys, SIGNAL(triggered()), this, SLOT(OnOpenHotkeysDialog())); | ||||||
| 
 | 
 | ||||||
|  | @ -331,6 +335,10 @@ void GMainWindow::SetHardwareRendererEnabled(bool enabled) { | ||||||
|     VideoCore::g_hw_renderer_enabled = enabled; |     VideoCore::g_hw_renderer_enabled = enabled; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | void GMainWindow::SetShaderJITEnabled(bool enabled) { | ||||||
|  |     VideoCore::g_shader_jit_enabled = enabled; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void GMainWindow::ToggleWindowMode() { | void GMainWindow::ToggleWindowMode() { | ||||||
|     if (ui.action_Single_Window_Mode->isChecked()) { |     if (ui.action_Single_Window_Mode->isChecked()) { | ||||||
|         // Render in the main window...
 |         // Render in the main window...
 | ||||||
|  |  | ||||||
|  | @ -70,6 +70,7 @@ private slots: | ||||||
|     void OnConfigure(); |     void OnConfigure(); | ||||||
|     void OnDisplayTitleBars(bool); |     void OnDisplayTitleBars(bool); | ||||||
|     void SetHardwareRendererEnabled(bool); |     void SetHardwareRendererEnabled(bool); | ||||||
|  |     void SetShaderJITEnabled(bool); | ||||||
|     void ToggleWindowMode(); |     void ToggleWindowMode(); | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|  |  | ||||||
|  | @ -66,6 +66,7 @@ | ||||||
|     <addaction name="action_Stop"/> |     <addaction name="action_Stop"/> | ||||||
|     <addaction name="separator"/> |     <addaction name="separator"/> | ||||||
|     <addaction name="action_Use_Hardware_Renderer"/> |     <addaction name="action_Use_Hardware_Renderer"/> | ||||||
|  |     <addaction name="action_Use_Shader_JIT"/> | ||||||
|     <addaction name="action_Configure"/> |     <addaction name="action_Configure"/> | ||||||
|    </widget> |    </widget> | ||||||
|    <widget class="QMenu" name="menu_View"> |    <widget class="QMenu" name="menu_View"> | ||||||
|  | @ -153,6 +154,14 @@ | ||||||
|     <string>Use Hardware Renderer</string> |     <string>Use Hardware Renderer</string> | ||||||
|    </property> |    </property> | ||||||
|   </action> |   </action> | ||||||
|  |   <action name="action_Use_Shader_JIT"> | ||||||
|  |    <property name="checkable"> | ||||||
|  |     <bool>true</bool> | ||||||
|  |    </property> | ||||||
|  |    <property name="text"> | ||||||
|  |     <string>Use Shader JIT</string> | ||||||
|  |    </property> | ||||||
|  |   </action> | ||||||
|   <action name="action_Configure"> |   <action name="action_Configure"> | ||||||
|    <property name="text"> |    <property name="text"> | ||||||
|     <string>Configure ...</string> |     <string>Configure ...</string> | ||||||
|  |  | ||||||
|  | @ -5,6 +5,7 @@ set(SRCS | ||||||
|             break_points.cpp |             break_points.cpp | ||||||
|             emu_window.cpp |             emu_window.cpp | ||||||
|             file_util.cpp |             file_util.cpp | ||||||
|  |             hash.cpp | ||||||
|             key_map.cpp |             key_map.cpp | ||||||
|             logging/filter.cpp |             logging/filter.cpp | ||||||
|             logging/text_formatter.cpp |             logging/text_formatter.cpp | ||||||
|  | @ -24,14 +25,15 @@ set(HEADERS | ||||||
|             bit_field.h |             bit_field.h | ||||||
|             break_points.h |             break_points.h | ||||||
|             chunk_file.h |             chunk_file.h | ||||||
|  |             code_block.h | ||||||
|             color.h |             color.h | ||||||
|             common_funcs.h |             common_funcs.h | ||||||
|             common_paths.h |             common_paths.h | ||||||
|             common_types.h |             common_types.h | ||||||
|             cpu_detect.h |  | ||||||
|             debug_interface.h |             debug_interface.h | ||||||
|             emu_window.h |             emu_window.h | ||||||
|             file_util.h |             file_util.h | ||||||
|  |             hash.h | ||||||
|             key_map.h |             key_map.h | ||||||
|             linear_disk_cache.h |             linear_disk_cache.h | ||||||
|             logging/text_formatter.h |             logging/text_formatter.h | ||||||
|  | @ -56,6 +58,18 @@ set(HEADERS | ||||||
|             vector_math.h |             vector_math.h | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|  | if(ARCHITECTURE_x86_64) | ||||||
|  |     set(SRCS ${SRCS} | ||||||
|  |             x64/abi.cpp | ||||||
|  |             x64/cpu_detect.cpp | ||||||
|  |             x64/emitter.cpp) | ||||||
|  | 
 | ||||||
|  |     set(HEADERS ${HEADERS} | ||||||
|  |             x64/abi.h | ||||||
|  |             x64/cpu_detect.h | ||||||
|  |             x64/emitter.h) | ||||||
|  | endif() | ||||||
|  | 
 | ||||||
| create_directory_groups(${SRCS} ${HEADERS}) | create_directory_groups(${SRCS} ${HEADERS}) | ||||||
| 
 | 
 | ||||||
| add_library(common STATIC ${SRCS} ${HEADERS}) | add_library(common STATIC ${SRCS} ${HEADERS}) | ||||||
|  |  | ||||||
							
								
								
									
										87
									
								
								src/common/code_block.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								src/common/code_block.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,87 @@ | ||||||
|  | // Copyright 2013 Dolphin Emulator Project
 | ||||||
|  | // Licensed under GPLv2
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "common_types.h" | ||||||
|  | #include "memory_util.h" | ||||||
|  | 
 | ||||||
|  | // Everything that needs to generate code should inherit from this.
 | ||||||
|  | // You get memory management for free, plus, you can use all emitter functions without
 | ||||||
|  | // having to prefix them with gen-> or something similar.
 | ||||||
|  | // Example implementation:
 | ||||||
|  | // class JIT : public CodeBlock<ARMXEmitter> {}
 | ||||||
|  | template<class T> class CodeBlock : public T, NonCopyable | ||||||
|  | { | ||||||
|  | private: | ||||||
|  |     // A privately used function to set the executable RAM space to something invalid.
 | ||||||
|  |     // For debugging usefulness it should be used to set the RAM to a host specific breakpoint instruction
 | ||||||
|  |     virtual void PoisonMemory() = 0; | ||||||
|  | 
 | ||||||
|  | protected: | ||||||
|  |     u8 *region; | ||||||
|  |     size_t region_size; | ||||||
|  | 
 | ||||||
|  | public: | ||||||
|  |     CodeBlock() : region(nullptr), region_size(0) {} | ||||||
|  |     virtual ~CodeBlock() { if (region) FreeCodeSpace(); } | ||||||
|  | 
 | ||||||
|  |     // Call this before you generate any code.
 | ||||||
|  |     void AllocCodeSpace(int size) | ||||||
|  |     { | ||||||
|  |         region_size = size; | ||||||
|  |         region = (u8*)AllocateExecutableMemory(region_size); | ||||||
|  |         T::SetCodePtr(region); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Always clear code space with breakpoints, so that if someone accidentally executes
 | ||||||
|  |     // uninitialized, it just breaks into the debugger.
 | ||||||
|  |     void ClearCodeSpace() | ||||||
|  |     { | ||||||
|  |         PoisonMemory(); | ||||||
|  |         ResetCodePtr(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
 | ||||||
|  |     void FreeCodeSpace() | ||||||
|  |     { | ||||||
|  | #ifdef __SYMBIAN32__ | ||||||
|  |         ResetExecutableMemory(region); | ||||||
|  | #else | ||||||
|  |         FreeMemoryPages(region, region_size); | ||||||
|  | #endif | ||||||
|  |         region = nullptr; | ||||||
|  |         region_size = 0; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     bool IsInSpace(const u8 *ptr) | ||||||
|  |     { | ||||||
|  |         return (ptr >= region) && (ptr < (region + region_size)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Cannot currently be undone. Will write protect the entire code region.
 | ||||||
|  |     // Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
 | ||||||
|  |     void WriteProtect() | ||||||
|  |     { | ||||||
|  |         WriteProtectMemory(region, region_size, true); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     void ResetCodePtr() | ||||||
|  |     { | ||||||
|  |         T::SetCodePtr(region); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     size_t GetSpaceLeft() const | ||||||
|  |     { | ||||||
|  |         return region_size - (T::GetCodePtr() - region); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     u8 *GetBasePtr() { | ||||||
|  |         return region; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     size_t GetOffset(const u8 *ptr) const { | ||||||
|  |         return ptr - region; | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  | @ -35,7 +35,7 @@ | ||||||
| 
 | 
 | ||||||
| #ifndef _MSC_VER | #ifndef _MSC_VER | ||||||
| 
 | 
 | ||||||
| #if defined(__x86_64__) || defined(_M_X64) | #ifdef ARCHITECTURE_x86_64 | ||||||
| #define Crash() __asm__ __volatile__("int $3") | #define Crash() __asm__ __volatile__("int $3") | ||||||
| #elif defined(_M_ARM) | #elif defined(_M_ARM) | ||||||
| #define Crash() __asm__ __volatile__("trap") | #define Crash() __asm__ __volatile__("trap") | ||||||
|  |  | ||||||
|  | @ -1,78 +0,0 @@ | ||||||
| // Copyright 2013 Dolphin Emulator Project / 2014 Citra Emulator Project
 |  | ||||||
| // Licensed under GPLv2 or any later version
 |  | ||||||
| // Refer to the license.txt file included.
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| // Detect the cpu, so we'll know which optimizations to use
 |  | ||||||
| #pragma once |  | ||||||
| 
 |  | ||||||
| #include <string> |  | ||||||
| 
 |  | ||||||
| enum CPUVendor |  | ||||||
| { |  | ||||||
|     VENDOR_INTEL = 0, |  | ||||||
|     VENDOR_AMD = 1, |  | ||||||
|     VENDOR_ARM = 2, |  | ||||||
|     VENDOR_OTHER = 3, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| struct CPUInfo |  | ||||||
| { |  | ||||||
|     CPUVendor vendor; |  | ||||||
| 
 |  | ||||||
|     char cpu_string[0x21]; |  | ||||||
|     char brand_string[0x41]; |  | ||||||
|     bool OS64bit; |  | ||||||
|     bool CPU64bit; |  | ||||||
|     bool Mode64bit; |  | ||||||
| 
 |  | ||||||
|     bool HTT; |  | ||||||
|     int num_cores; |  | ||||||
|     int logical_cpu_count; |  | ||||||
| 
 |  | ||||||
|     bool bSSE; |  | ||||||
|     bool bSSE2; |  | ||||||
|     bool bSSE3; |  | ||||||
|     bool bSSSE3; |  | ||||||
|     bool bPOPCNT; |  | ||||||
|     bool bSSE4_1; |  | ||||||
|     bool bSSE4_2; |  | ||||||
|     bool bLZCNT; |  | ||||||
|     bool bSSE4A; |  | ||||||
|     bool bAVX; |  | ||||||
|     bool bAES; |  | ||||||
|     bool bLAHFSAHF64; |  | ||||||
|     bool bLongMode; |  | ||||||
| 
 |  | ||||||
|     // ARM specific CPUInfo
 |  | ||||||
|     bool bSwp; |  | ||||||
|     bool bHalf; |  | ||||||
|     bool bThumb; |  | ||||||
|     bool bFastMult; |  | ||||||
|     bool bVFP; |  | ||||||
|     bool bEDSP; |  | ||||||
|     bool bThumbEE; |  | ||||||
|     bool bNEON; |  | ||||||
|     bool bVFPv3; |  | ||||||
|     bool bTLS; |  | ||||||
|     bool bVFPv4; |  | ||||||
|     bool bIDIVa; |  | ||||||
|     bool bIDIVt; |  | ||||||
|     bool bArmV7;  // enable MOVT, MOVW etc
 |  | ||||||
| 
 |  | ||||||
|     // ARMv8 specific
 |  | ||||||
|     bool bFP; |  | ||||||
|     bool bASIMD; |  | ||||||
| 
 |  | ||||||
|     // Call Detect()
 |  | ||||||
|     explicit CPUInfo(); |  | ||||||
| 
 |  | ||||||
|     // Turn the cpu info into a string we can show
 |  | ||||||
|     std::string Summarize(); |  | ||||||
| 
 |  | ||||||
| private: |  | ||||||
|     // Detects the various cpu features
 |  | ||||||
|     void Detect(); |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| extern CPUInfo cpu_info; |  | ||||||
							
								
								
									
										126
									
								
								src/common/hash.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								src/common/hash.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,126 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #if defined(_MSC_VER) | ||||||
|  | #include <stdlib.h> | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #include "common_funcs.h" | ||||||
|  | #include "common_types.h" | ||||||
|  | #include "hash.h" | ||||||
|  | 
 | ||||||
|  | namespace Common { | ||||||
|  | 
 | ||||||
|  | // MurmurHash3 was written by Austin Appleby, and is placed in the public
 | ||||||
|  | // domain. The author hereby disclaims copyright to this source code.
 | ||||||
|  | 
 | ||||||
|  | // Block read - if your platform needs to do endian-swapping or can only handle aligned reads, do
 | ||||||
|  | // the conversion here
 | ||||||
|  | 
 | ||||||
|  | static FORCE_INLINE u32 getblock32(const u32* p, int i) { | ||||||
|  |     return p[i]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static FORCE_INLINE u64 getblock64(const u64* p, int i) { | ||||||
|  |     return p[i]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Finalization mix - force all bits of a hash block to avalanche
 | ||||||
|  | 
 | ||||||
|  | static FORCE_INLINE u32 fmix32(u32 h) { | ||||||
|  |     h ^= h >> 16; | ||||||
|  |     h *= 0x85ebca6b; | ||||||
|  |     h ^= h >> 13; | ||||||
|  |     h *= 0xc2b2ae35; | ||||||
|  |     h ^= h >> 16; | ||||||
|  | 
 | ||||||
|  |     return h; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static FORCE_INLINE u64 fmix64(u64 k) { | ||||||
|  |     k ^= k >> 33; | ||||||
|  |     k *= 0xff51afd7ed558ccdllu; | ||||||
|  |     k ^= k >> 33; | ||||||
|  |     k *= 0xc4ceb9fe1a85ec53llu; | ||||||
|  |     k ^= k >> 33; | ||||||
|  | 
 | ||||||
|  |     return k; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // This is the 128-bit variant of the MurmurHash3 hash function that is targetted for 64-bit
 | ||||||
|  | // platforms (MurmurHash3_x64_128). It was taken from:
 | ||||||
|  | // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
 | ||||||
|  | void MurmurHash3_128(const void* key, int len, u32 seed, void* out) { | ||||||
|  |     const u8 * data = (const u8*)key; | ||||||
|  |     const int nblocks = len / 16; | ||||||
|  | 
 | ||||||
|  |     u64 h1 = seed; | ||||||
|  |     u64 h2 = seed; | ||||||
|  | 
 | ||||||
|  |     const u64 c1 = 0x87c37b91114253d5llu; | ||||||
|  |     const u64 c2 = 0x4cf5ad432745937fllu; | ||||||
|  | 
 | ||||||
|  |     // Body
 | ||||||
|  | 
 | ||||||
|  |     const u64 * blocks = (const u64 *)(data); | ||||||
|  | 
 | ||||||
|  |     for (int i = 0; i < nblocks; i++) { | ||||||
|  |         u64 k1 = getblock64(blocks,i*2+0); | ||||||
|  |         u64 k2 = getblock64(blocks,i*2+1); | ||||||
|  | 
 | ||||||
|  |         k1 *= c1; k1  = _rotl64(k1,31); k1 *= c2; h1 ^= k1; | ||||||
|  | 
 | ||||||
|  |         h1 = _rotl64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; | ||||||
|  | 
 | ||||||
|  |         k2 *= c2; k2  = _rotl64(k2,33); k2 *= c1; h2 ^= k2; | ||||||
|  | 
 | ||||||
|  |         h2 = _rotl64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Tail
 | ||||||
|  | 
 | ||||||
|  |     const u8 * tail = (const u8*)(data + nblocks*16); | ||||||
|  | 
 | ||||||
|  |     u64 k1 = 0; | ||||||
|  |     u64 k2 = 0; | ||||||
|  | 
 | ||||||
|  |     switch (len & 15) { | ||||||
|  |     case 15: k2 ^= ((u64)tail[14]) << 48; | ||||||
|  |     case 14: k2 ^= ((u64)tail[13]) << 40; | ||||||
|  |     case 13: k2 ^= ((u64)tail[12]) << 32; | ||||||
|  |     case 12: k2 ^= ((u64)tail[11]) << 24; | ||||||
|  |     case 11: k2 ^= ((u64)tail[10]) << 16; | ||||||
|  |     case 10: k2 ^= ((u64)tail[ 9]) << 8; | ||||||
|  |     case  9: k2 ^= ((u64)tail[ 8]) << 0; | ||||||
|  |         k2 *= c2; k2  = _rotl64(k2,33); k2 *= c1; h2 ^= k2; | ||||||
|  | 
 | ||||||
|  |     case  8: k1 ^= ((u64)tail[ 7]) << 56; | ||||||
|  |     case  7: k1 ^= ((u64)tail[ 6]) << 48; | ||||||
|  |     case  6: k1 ^= ((u64)tail[ 5]) << 40; | ||||||
|  |     case  5: k1 ^= ((u64)tail[ 4]) << 32; | ||||||
|  |     case  4: k1 ^= ((u64)tail[ 3]) << 24; | ||||||
|  |     case  3: k1 ^= ((u64)tail[ 2]) << 16; | ||||||
|  |     case  2: k1 ^= ((u64)tail[ 1]) << 8; | ||||||
|  |     case  1: k1 ^= ((u64)tail[ 0]) << 0; | ||||||
|  |         k1 *= c1; k1  = _rotl64(k1,31); k1 *= c2; h1 ^= k1; | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     // Finalization
 | ||||||
|  | 
 | ||||||
|  |     h1 ^= len; h2 ^= len; | ||||||
|  | 
 | ||||||
|  |     h1 += h2; | ||||||
|  |     h2 += h1; | ||||||
|  | 
 | ||||||
|  |     h1 = fmix64(h1); | ||||||
|  |     h2 = fmix64(h2); | ||||||
|  | 
 | ||||||
|  |     h1 += h2; | ||||||
|  |     h2 += h1; | ||||||
|  | 
 | ||||||
|  |     ((u64*)out)[0] = h1; | ||||||
|  |     ((u64*)out)[1] = h2; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace Common
 | ||||||
							
								
								
									
										25
									
								
								src/common/hash.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								src/common/hash.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "common/common_types.h" | ||||||
|  | 
 | ||||||
|  | namespace Common { | ||||||
|  | 
 | ||||||
|  | void MurmurHash3_128(const void* key, int len, u32 seed, void* out); | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Computes a 64-bit hash over the specified block of data | ||||||
|  |  * @param data Block of data to compute hash over | ||||||
|  |  * @param len Length of data (in bytes) to compute hash over | ||||||
|  |  * @returns 64-bit hash value that was computed over the data block | ||||||
|  |  */ | ||||||
|  | static inline u64 ComputeHash64(const void* data, int len) { | ||||||
|  |     u64 res[2]; | ||||||
|  |     MurmurHash3_128(data, len, 0, res); | ||||||
|  |     return res[0]; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace Common
 | ||||||
|  | @ -16,7 +16,7 @@ | ||||||
|     #include <sys/mman.h> |     #include <sys/mman.h> | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) | #if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) | ||||||
| #include <unistd.h> | #include <unistd.h> | ||||||
| #define PAGE_MASK     (getpagesize() - 1) | #define PAGE_MASK     (getpagesize() - 1) | ||||||
| #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) | #define round_page(x) ((((unsigned long)(x)) + PAGE_MASK) & ~(PAGE_MASK)) | ||||||
|  | @ -31,7 +31,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | ||||||
|     void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); |     void* ptr = VirtualAlloc(0, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE); | ||||||
| #else | #else | ||||||
|     static char *map_hint = 0; |     static char *map_hint = 0; | ||||||
| #if defined(__x86_64__) && !defined(MAP_32BIT) | #if defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) | ||||||
|     // This OS has no flag to enforce allocation below the 4 GB boundary,
 |     // This OS has no flag to enforce allocation below the 4 GB boundary,
 | ||||||
|     // but if we hint that we want a low address it is very likely we will
 |     // but if we hint that we want a low address it is very likely we will
 | ||||||
|     // get one.
 |     // get one.
 | ||||||
|  | @ -43,7 +43,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | ||||||
| #endif | #endif | ||||||
|     void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, |     void* ptr = mmap(map_hint, size, PROT_READ | PROT_WRITE | PROT_EXEC, | ||||||
|         MAP_ANON | MAP_PRIVATE |         MAP_ANON | MAP_PRIVATE | ||||||
| #if defined(__x86_64__) && defined(MAP_32BIT) | #if defined(ARCHITECTURE_X64) && defined(MAP_32BIT) | ||||||
|         | (low ? MAP_32BIT : 0) |         | (low ? MAP_32BIT : 0) | ||||||
| #endif | #endif | ||||||
|         , -1, 0); |         , -1, 0); | ||||||
|  | @ -62,7 +62,7 @@ void* AllocateExecutableMemory(size_t size, bool low) | ||||||
| #endif | #endif | ||||||
|         LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); |         LOG_ERROR(Common_Memory, "Failed to allocate executable memory"); | ||||||
|     } |     } | ||||||
| #if !defined(_WIN32) && defined(__x86_64__) && !defined(MAP_32BIT) | #if !defined(_WIN32) && defined(ARCHITECTURE_X64) && !defined(MAP_32BIT) | ||||||
|     else |     else | ||||||
|     { |     { | ||||||
|         if (low) |         if (low) | ||||||
|  |  | ||||||
|  | @ -27,7 +27,7 @@ | ||||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ////////////////////////////////////////////////////////////////////////////////////////////////////
 | ||||||
| // Platform detection
 | // Platform detection
 | ||||||
| 
 | 
 | ||||||
| #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) | #if defined(ARCHITECTURE_x86_64) || defined(__aarch64__) | ||||||
|     #define EMU_ARCH_BITS 64 |     #define EMU_ARCH_BITS 64 | ||||||
| #elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) | #elif defined(__i386) || defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) | ||||||
|     #define EMU_ARCH_BITS 32 |     #define EMU_ARCH_BITS 32 | ||||||
|  |  | ||||||
							
								
								
									
										680
									
								
								src/common/x64/abi.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										680
									
								
								src/common/x64/abi.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,680 @@ | ||||||
|  | // Copyright (C) 2003 Dolphin Project.
 | ||||||
|  | 
 | ||||||
|  | // This program is free software: you can redistribute it and/or modify
 | ||||||
|  | // it under the terms of the GNU General Public License as published by
 | ||||||
|  | // the Free Software Foundation, version 2.0 or later versions.
 | ||||||
|  | 
 | ||||||
|  | // This program is distributed in the hope that it will be useful,
 | ||||||
|  | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | ||||||
|  | // GNU General Public License 2.0 for more details.
 | ||||||
|  | 
 | ||||||
|  | // A copy of the GPL 2.0 should have been included with the program.
 | ||||||
|  | // If not, see http://www.gnu.org/licenses/
 | ||||||
|  | 
 | ||||||
|  | // Official SVN repository and contact information can be found at
 | ||||||
|  | // http://code.google.com/p/dolphin-emu/
 | ||||||
|  | 
 | ||||||
|  | #include "abi.h" | ||||||
|  | #include "emitter.h" | ||||||
|  | 
 | ||||||
|  | using namespace Gen; | ||||||
|  | 
 | ||||||
|  | // Shared code between Win64 and Unix64
 | ||||||
|  | 
 | ||||||
|  | // Sets up a __cdecl function.
 | ||||||
|  | void XEmitter::ABI_EmitPrologue(int maxCallParams) | ||||||
|  | { | ||||||
|  | #ifdef _M_IX86 | ||||||
|  |     // Don't really need to do anything
 | ||||||
|  | #elif defined(ARCHITECTURE_x86_64) | ||||||
|  | #if _WIN32 | ||||||
|  |     int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8; | ||||||
|  |     // Set up a stack frame so that we can call functions
 | ||||||
|  |     // TODO: use maxCallParams
 | ||||||
|  |     SUB(64, R(RSP), Imm8(stacksize)); | ||||||
|  | #endif | ||||||
|  | #else | ||||||
|  | #error Arch not supported | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_EmitEpilogue(int maxCallParams) | ||||||
|  | { | ||||||
|  | #ifdef _M_IX86 | ||||||
|  |     RET(); | ||||||
|  | #elif defined(ARCHITECTURE_x86_64) | ||||||
|  | #ifdef _WIN32 | ||||||
|  |     int stacksize = ((maxCallParams+1)&~1)*8 + 8; | ||||||
|  |     ADD(64, R(RSP), Imm8(stacksize)); | ||||||
|  | #endif | ||||||
|  |     RET(); | ||||||
|  | #else | ||||||
|  | #error Arch not supported | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #ifdef _M_IX86 // All32
 | ||||||
|  | 
 | ||||||
|  | // Shared code between Win32 and Unix32
 | ||||||
|  | void XEmitter::ABI_CallFunction(const void *func) { | ||||||
|  |     ABI_AlignStack(0); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { | ||||||
|  |     ABI_AlignStack(1 * 2); | ||||||
|  |     PUSH(16, Imm16(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 2); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { | ||||||
|  |     ABI_AlignStack(1 * 2 + 1 * 4); | ||||||
|  |     PUSH(16, Imm16(param2)); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 2 + 1 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { | ||||||
|  |     ABI_AlignStack(1 * 4); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { | ||||||
|  |     ABI_AlignStack(2 * 4); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(2 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { | ||||||
|  |     ABI_AlignStack(3 * 4); | ||||||
|  |     PUSH(32, Imm32(param3)); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(3 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { | ||||||
|  |     ABI_AlignStack(3 * 4); | ||||||
|  |     PUSH(32, ImmPtr(param3)); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(3 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) { | ||||||
|  |     ABI_AlignStack(4 * 4); | ||||||
|  |     PUSH(32, ImmPtr(param4)); | ||||||
|  |     PUSH(32, Imm32(param3)); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, Imm32(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(4 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { | ||||||
|  |     ABI_AlignStack(1 * 4); | ||||||
|  |     PUSH(32, ImmPtr(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { | ||||||
|  |     ABI_AlignStack(2 * 4); | ||||||
|  |     PUSH(32, arg2); | ||||||
|  |     PUSH(32, ImmPtr(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(2 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { | ||||||
|  |     ABI_AlignStack(3 * 4); | ||||||
|  |     PUSH(32, arg3); | ||||||
|  |     PUSH(32, arg2); | ||||||
|  |     PUSH(32, ImmPtr(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(3 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { | ||||||
|  |     ABI_AlignStack(3 * 4); | ||||||
|  |     PUSH(32, Imm32(param3)); | ||||||
|  |     PUSH(32, ImmPtr(param2)); | ||||||
|  |     PUSH(32, ImmPtr(param1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(3 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Pass a register as a parameter.
 | ||||||
|  | void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { | ||||||
|  |     ABI_AlignStack(1 * 4); | ||||||
|  |     PUSH(32, R(reg1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Pass two registers as parameters.
 | ||||||
|  | void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2) | ||||||
|  | { | ||||||
|  |     ABI_AlignStack(2 * 4); | ||||||
|  |     PUSH(32, R(reg2)); | ||||||
|  |     PUSH(32, R(reg1)); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(2 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) | ||||||
|  | { | ||||||
|  |     ABI_AlignStack(2 * 4); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, arg1); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(2 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) | ||||||
|  | { | ||||||
|  |     ABI_AlignStack(3 * 4); | ||||||
|  |     PUSH(32, Imm32(param3)); | ||||||
|  |     PUSH(32, Imm32(param2)); | ||||||
|  |     PUSH(32, arg1); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(3 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) | ||||||
|  | { | ||||||
|  |     ABI_AlignStack(1 * 4); | ||||||
|  |     PUSH(32, arg1); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(1 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) | ||||||
|  | { | ||||||
|  |     ABI_AlignStack(2 * 4); | ||||||
|  |     PUSH(32, arg2); | ||||||
|  |     PUSH(32, arg1); | ||||||
|  |     CALL(func); | ||||||
|  |     ABI_RestoreStack(2 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     // Note: 4 * 4 = 16 bytes, so alignment is preserved.
 | ||||||
|  |     PUSH(EBP); | ||||||
|  |     PUSH(EBX); | ||||||
|  |     PUSH(ESI); | ||||||
|  |     PUSH(EDI); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     POP(EDI); | ||||||
|  |     POP(ESI); | ||||||
|  |     POP(EBX); | ||||||
|  |     POP(EBP); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||||||
|  |     frameSize += 4; // reserve space for return address
 | ||||||
|  |     unsigned int alignedSize = | ||||||
|  | #ifdef __GNUC__ | ||||||
|  |         (frameSize + 15) & -16; | ||||||
|  | #else | ||||||
|  |         (frameSize + 3) & -4; | ||||||
|  | #endif | ||||||
|  |     return alignedSize; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_AlignStack(unsigned int frameSize) { | ||||||
|  | // Mac OS X requires the stack to be 16-byte aligned before every call.
 | ||||||
|  | // Linux requires the stack to be 16-byte aligned before calls that put SSE
 | ||||||
|  | // vectors on the stack, but since we do not keep track of which calls do that,
 | ||||||
|  | // it is effectively every call as well.
 | ||||||
|  | // Windows binaries compiled with MSVC do not have such a restriction*, but I
 | ||||||
|  | // expect that GCC on Windows acts the same as GCC on Linux in this respect.
 | ||||||
|  | // It would be nice if someone could verify this.
 | ||||||
|  | // *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times.
 | ||||||
|  |     unsigned int fillSize = | ||||||
|  |         ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4); | ||||||
|  |     if (fillSize != 0) { | ||||||
|  |         SUB(32, R(ESP), Imm8(fillSize)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_RestoreStack(unsigned int frameSize) { | ||||||
|  |     unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize); | ||||||
|  |     alignedSize -= 4; // return address is POPped at end of call
 | ||||||
|  |     if (alignedSize != 0) { | ||||||
|  |         ADD(32, R(ESP), Imm8(alignedSize)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #else //64bit
 | ||||||
|  | 
 | ||||||
|  | // Common functions
 | ||||||
|  | void XEmitter::ABI_CallFunction(const void *func) { | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32((u32)param1)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32((u32)param2)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |         && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |             // Far call
 | ||||||
|  |             MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |             CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     MOV(64, R(ABI_PARAM3), ImmPtr(param3)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4) { | ||||||
|  |     MOV(32, R(ABI_PARAM1), Imm32(param1)); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||||||
|  |     MOV(64, R(ABI_PARAM4), ImmPtr(param4)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionP(const void *func, void *param1) { | ||||||
|  |     MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) { | ||||||
|  |     MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||||||
|  |     if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||||||
|  |         MOV(32, R(ABI_PARAM2), arg2); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) { | ||||||
|  |     MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||||||
|  |     if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||||||
|  |         MOV(32, R(ABI_PARAM2), arg2); | ||||||
|  |     if (!arg3.IsSimpleReg(ABI_PARAM3)) | ||||||
|  |         MOV(32, R(ABI_PARAM3), arg3); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) { | ||||||
|  |     MOV(64, R(ABI_PARAM1), ImmPtr(param1)); | ||||||
|  |     MOV(64, R(ABI_PARAM2), ImmPtr(param2)); | ||||||
|  |     MOV(32, R(ABI_PARAM3), Imm32(param3)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Pass a register as a parameter.
 | ||||||
|  | void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) { | ||||||
|  |     if (reg1 != ABI_PARAM1) | ||||||
|  |         MOV(32, R(ABI_PARAM1), R(reg1)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Pass two registers as parameters.
 | ||||||
|  | void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2) { | ||||||
|  |     if (reg2 != ABI_PARAM1) { | ||||||
|  |         if (reg1 != ABI_PARAM1) | ||||||
|  |             MOV(64, R(ABI_PARAM1), R(reg1)); | ||||||
|  |         if (reg2 != ABI_PARAM2) | ||||||
|  |             MOV(64, R(ABI_PARAM2), R(reg2)); | ||||||
|  |     } else { | ||||||
|  |         if (reg2 != ABI_PARAM2) | ||||||
|  |             MOV(64, R(ABI_PARAM2), R(reg2)); | ||||||
|  |         if (reg1 != ABI_PARAM1) | ||||||
|  |             MOV(64, R(ABI_PARAM1), R(reg1)); | ||||||
|  |     } | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2) | ||||||
|  | { | ||||||
|  |     if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||||||
|  |         MOV(32, R(ABI_PARAM1), arg1); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3) | ||||||
|  | { | ||||||
|  |     if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||||||
|  |         MOV(32, R(ABI_PARAM1), arg1); | ||||||
|  |     MOV(32, R(ABI_PARAM2), Imm32(param2)); | ||||||
|  |     MOV(64, R(ABI_PARAM3), Imm64(param3)); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1) | ||||||
|  | { | ||||||
|  |     if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||||||
|  |         MOV(32, R(ABI_PARAM1), arg1); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2) | ||||||
|  | { | ||||||
|  |     if (!arg1.IsSimpleReg(ABI_PARAM1)) | ||||||
|  |         MOV(32, R(ABI_PARAM1), arg1); | ||||||
|  |     if (!arg2.IsSimpleReg(ABI_PARAM2)) | ||||||
|  |         MOV(32, R(ABI_PARAM2), arg2); | ||||||
|  |     u64 distance = u64(func) - (u64(code) + 5); | ||||||
|  |     if (distance >= 0x0000000080000000ULL | ||||||
|  |      && distance <  0xFFFFFFFF80000000ULL) { | ||||||
|  |         // Far call
 | ||||||
|  |         MOV(64, R(RAX), ImmPtr(func)); | ||||||
|  |         CALLptr(R(RAX)); | ||||||
|  |     } else { | ||||||
|  |         CALL(func); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) { | ||||||
|  |     return frameSize; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #ifdef _WIN32 | ||||||
|  | 
 | ||||||
|  | // The Windows x64 ABI requires XMM6 - XMM15 to be callee saved.  10 regs.
 | ||||||
|  | // But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs.
 | ||||||
|  | // Let's just save all 16.
 | ||||||
|  | const int XMM_STACK_SPACE = 16 * 16; | ||||||
|  | 
 | ||||||
|  | // Win64 Specific Code
 | ||||||
|  | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     //we only want to do this once
 | ||||||
|  |     PUSH(RBX); | ||||||
|  |     PUSH(RSI); | ||||||
|  |     PUSH(RDI); | ||||||
|  |     PUSH(RBP); | ||||||
|  |     PUSH(R12); | ||||||
|  |     PUSH(R13); | ||||||
|  |     PUSH(R14); | ||||||
|  |     PUSH(R15); | ||||||
|  |     ABI_AlignStack(0); | ||||||
|  | 
 | ||||||
|  |     // Do this after aligning, because before it's offset by 8.
 | ||||||
|  |     SUB(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||||||
|  |     for (int i = 0; i < 16; ++i) | ||||||
|  |         MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     for (int i = 0; i < 16; ++i) | ||||||
|  |         MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16)); | ||||||
|  |     ADD(64, R(RSP), Imm32(XMM_STACK_SPACE)); | ||||||
|  | 
 | ||||||
|  |     ABI_RestoreStack(0); | ||||||
|  |     POP(R15); | ||||||
|  |     POP(R14); | ||||||
|  |     POP(R13); | ||||||
|  |     POP(R12); | ||||||
|  |     POP(RBP); | ||||||
|  |     POP(RDI); | ||||||
|  |     POP(RSI); | ||||||
|  |     POP(RBX); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // Win64 Specific Code
 | ||||||
|  | void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||||||
|  |     PUSH(RCX); | ||||||
|  |     PUSH(RDX); | ||||||
|  |     PUSH(RSI); | ||||||
|  |     PUSH(RDI); | ||||||
|  |     PUSH(R8); | ||||||
|  |     PUSH(R9); | ||||||
|  |     PUSH(R10); | ||||||
|  |     PUSH(R11); | ||||||
|  |     // TODO: Callers preserve XMM4-5 (XMM0-3 are args.)
 | ||||||
|  |     ABI_AlignStack(0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||||||
|  |     ABI_RestoreStack(0); | ||||||
|  |     POP(R11); | ||||||
|  |     POP(R10); | ||||||
|  |     POP(R9); | ||||||
|  |     POP(R8); | ||||||
|  |     POP(RDI); | ||||||
|  |     POP(RSI); | ||||||
|  |     POP(RDX); | ||||||
|  |     POP(RCX); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||||||
|  |     SUB(64, R(RSP), Imm8(0x28)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||||||
|  |     ADD(64, R(RSP), Imm8(0x28)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #else | ||||||
|  | // Unix64 Specific Code
 | ||||||
|  | void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     PUSH(RBX); | ||||||
|  |     PUSH(RBP); | ||||||
|  |     PUSH(R12); | ||||||
|  |     PUSH(R13); | ||||||
|  |     PUSH(R14); | ||||||
|  |     PUSH(R15); | ||||||
|  |     PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
 | ||||||
|  |     // TODO: XMM?
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() { | ||||||
|  |     POP(R15); | ||||||
|  |     POP(R15); | ||||||
|  |     POP(R14); | ||||||
|  |     POP(R13); | ||||||
|  |     POP(R12); | ||||||
|  |     POP(RBP); | ||||||
|  |     POP(RBX); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() { | ||||||
|  |     PUSH(RCX); | ||||||
|  |     PUSH(RDX); | ||||||
|  |     PUSH(RSI); | ||||||
|  |     PUSH(RDI); | ||||||
|  |     PUSH(R8); | ||||||
|  |     PUSH(R9); | ||||||
|  |     PUSH(R10); | ||||||
|  |     PUSH(R11); | ||||||
|  |     PUSH(R11); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() { | ||||||
|  |     POP(R11); | ||||||
|  |     POP(R11); | ||||||
|  |     POP(R10); | ||||||
|  |     POP(R9); | ||||||
|  |     POP(R8); | ||||||
|  |     POP(RDI); | ||||||
|  |     POP(RSI); | ||||||
|  |     POP(RDX); | ||||||
|  |     POP(RCX); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) { | ||||||
|  |     SUB(64, R(RSP), Imm8(0x08)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) { | ||||||
|  |     ADD(64, R(RSP), Imm8(0x08)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif // WIN32
 | ||||||
|  | 
 | ||||||
|  | #endif // 32bit
 | ||||||
							
								
								
									
										78
									
								
								src/common/x64/abi.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								src/common/x64/abi.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,78 @@ | ||||||
|  | // Copyright (C) 2003 Dolphin Project.
 | ||||||
|  | 
 | ||||||
|  | // This program is free software: you can redistribute it and/or modify
 | ||||||
|  | // it under the terms of the GNU General Public License as published by
 | ||||||
|  | // the Free Software Foundation, version 2.0 or later versions.
 | ||||||
|  | 
 | ||||||
|  | // This program is distributed in the hope that it will be useful,
 | ||||||
|  | // but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||||
|  | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | ||||||
|  | // GNU General Public License 2.0 for more details.
 | ||||||
|  | 
 | ||||||
|  | // A copy of the GPL 2.0 should have been included with the program.
 | ||||||
|  | // If not, see http://www.gnu.org/licenses/
 | ||||||
|  | 
 | ||||||
|  | // Official SVN repository and contact information can be found at
 | ||||||
|  | // http://code.google.com/p/dolphin-emu/
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "common/common_types.h" | ||||||
|  | 
 | ||||||
|  | // x86/x64 ABIs, and helpers to help follow them when JIT-ing code.
 | ||||||
|  | // All conventions return values in EAX (+ possibly EDX).
 | ||||||
|  | 
 | ||||||
|  | // Linux 32-bit, Windows 32-bit (cdecl, System V):
 | ||||||
|  | // * Caller pushes right to left (last parameter first)
 | ||||||
|  | // * Caller fixes stack after call
 | ||||||
|  | // * Function subtracts from the stack pointer for local storage only.
 | ||||||
|  | // Scratch:      EAX ECX EDX
 | ||||||
|  | // Callee-save:  EBX ESI EDI EBP
 | ||||||
|  | // Parameters:   -
 | ||||||
|  | 
 | ||||||
|  | // Windows 64-bit
 | ||||||
|  | // * 4-reg "fastcall" variant, very new-skool stack handling
 | ||||||
|  | // * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_
 | ||||||
|  | // * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
 | ||||||
|  | // Scratch:      RAX RCX RDX R8 R9 R10 R11
 | ||||||
|  | // Callee-save:  RBX RSI RDI RBP R12 R13 R14 R15
 | ||||||
|  | // Parameters:   RCX RDX R8 R9, further MOV-ed
 | ||||||
|  | 
 | ||||||
|  | // Linux 64-bit
 | ||||||
|  | // * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
 | ||||||
|  | // Scratch:      RAX RCX RDX RSI RDI R8 R9 R10 R11
 | ||||||
|  | // Callee-save:  RBX RBP R12 R13 R14 R15
 | ||||||
|  | // Parameters:   RDI RSI RDX RCX R8 R9
 | ||||||
|  | 
 | ||||||
#ifdef _M_IX86 // 32 bit calling convention, shared by all

// 32-bit targets don't pass parameters in registers, but these names are convenient to have
// anyway when we have to choose registers to put stuff in.
#define ABI_PARAM1 RCX
#define ABI_PARAM2 RDX

// The arguments themselves are pushed on the stack, so no further ABI_PARAM* names are defined.
// 32-bit bog standard cdecl, shared between linux and windows
// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.

#elif defined(ARCHITECTURE_x86_64) // 64 bit calling convention

#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention

#define ABI_PARAM1 RCX
#define ABI_PARAM2 RDX
#define ABI_PARAM3 R8
#define ABI_PARAM4 R9

#else  //64-bit Unix (hopefully MacOSX too)

#define ABI_PARAM1 RDI
#define ABI_PARAM2 RSI
#define ABI_PARAM3 RDX
#define ABI_PARAM4 RCX
#define ABI_PARAM5 R8
#define ABI_PARAM6 R9

#endif // _WIN32

#endif // _M_IX86 / ARCHITECTURE_x86_64
							
								
								
									
										187
									
								
								src/common/x64/cpu_detect.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								src/common/x64/cpu_detect.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,187 @@ | ||||||
|  | // Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #include <cstring> | ||||||
|  | #include <string> | ||||||
|  | #include <thread> | ||||||
|  | 
 | ||||||
|  | #include "common/common_types.h" | ||||||
|  | 
 | ||||||
|  | #include "cpu_detect.h" | ||||||
|  | 
 | ||||||
|  | namespace Common { | ||||||
|  | 
 | ||||||
|  | #ifndef _MSC_VER | ||||||
|  | 
 | ||||||
|  | #ifdef __FreeBSD__ | ||||||
|  | #include <sys/types.h> | ||||||
|  | #include <machine/cpufunc.h> | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
/// Executes CPUID for the given leaf (function_id) and sub-leaf (subfunction_id) and stores the
/// resulting EAX, EBX, ECX and EDX values in info[0..3], in that order.
static inline void __cpuidex(int info[4], int function_id, int subfunction_id) {
#ifdef __FreeBSD__
    // Despite the name, this is just do_cpuid() with ECX as second input.
    cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
#else
    int eax, ebx, ecx, edx;
    __asm__(
        "cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (function_id), "c" (subfunction_id)
        );
    info[0] = eax;
    info[1] = ebx;
    info[2] = ecx;
    info[3] = edx;
#endif
}
|  | 
 | ||||||
|  | static inline void __cpuid(int info[4], int function_id) { | ||||||
|  |     return __cpuidex(info, function_id, 0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #define _XCR_XFEATURE_ENABLED_MASK 0 | ||||||
|  | static u64 _xgetbv(u32 index) { | ||||||
|  |     u32 eax, edx; | ||||||
|  |     __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); | ||||||
|  |     return ((u64)edx << 32) | eax; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif // ifndef _MSC_VER
 | ||||||
|  | 
 | ||||||
|  | // Detects the various CPU features
 | ||||||
|  | static CPUCaps Detect() { | ||||||
|  |     CPUCaps caps = {}; | ||||||
|  | 
 | ||||||
|  |     caps.num_cores = std::thread::hardware_concurrency(); | ||||||
|  | 
 | ||||||
|  |     // Assumes the CPU supports the CPUID instruction. Those that don't would likely not support
 | ||||||
|  |     // Citra at all anyway
 | ||||||
|  | 
 | ||||||
|  |     int cpu_id[4]; | ||||||
|  |     memset(caps.brand_string, 0, sizeof(caps.brand_string)); | ||||||
|  | 
 | ||||||
|  |     // Detect CPU's CPUID capabilities and grab CPU string
 | ||||||
|  |     __cpuid(cpu_id, 0x00000000); | ||||||
|  |     u32 max_std_fn = cpu_id[0]; // EAX
 | ||||||
|  | 
 | ||||||
|  |     std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int)); | ||||||
|  |     std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int)); | ||||||
|  |     std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int)); | ||||||
|  | 
 | ||||||
|  |     __cpuid(cpu_id, 0x80000000); | ||||||
|  | 
 | ||||||
|  |     u32 max_ex_fn = cpu_id[0]; | ||||||
|  |     if (!strcmp(caps.brand_string, "GenuineIntel")) | ||||||
|  |         caps.vendor = CPUVendor::INTEL; | ||||||
|  |     else if (!strcmp(caps.brand_string, "AuthenticAMD")) | ||||||
|  |         caps.vendor = CPUVendor::AMD; | ||||||
|  |     else | ||||||
|  |         caps.vendor = CPUVendor::OTHER; | ||||||
|  | 
 | ||||||
|  |     // Set reasonable default brand string even if brand string not available
 | ||||||
|  |     strcpy(caps.cpu_string, caps.brand_string); | ||||||
|  | 
 | ||||||
|  |     // Detect family and other miscellaneous features
 | ||||||
|  |     if (max_std_fn >= 1) { | ||||||
|  |         __cpuid(cpu_id, 0x00000001); | ||||||
|  | 
 | ||||||
|  |         if ((cpu_id[3] >> 25) & 1) caps.sse = true; | ||||||
|  |         if ((cpu_id[3] >> 26) & 1) caps.sse2 = true; | ||||||
|  |         if ((cpu_id[2]) & 1) caps.sse3 = true; | ||||||
|  |         if ((cpu_id[2] >> 9) & 1) caps.ssse3 = true; | ||||||
|  |         if ((cpu_id[2] >> 19) & 1) caps.sse4_1 = true; | ||||||
|  |         if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true; | ||||||
|  |         if ((cpu_id[2] >> 22) & 1) caps.movbe = true; | ||||||
|  |         if ((cpu_id[2] >> 25) & 1) caps.aes = true; | ||||||
|  | 
 | ||||||
|  |         if ((cpu_id[3] >> 24) & 1) { | ||||||
|  |             caps.fxsave_fxrstor = true; | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // AVX support requires 3 separate checks:
 | ||||||
|  |         //  - Is the AVX bit set in CPUID?
 | ||||||
|  |         //  - Is the XSAVE bit set in CPUID?
 | ||||||
|  |         //  - XGETBV result has the XCR bit set.
 | ||||||
|  |         if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) { | ||||||
|  |             if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) { | ||||||
|  |                 caps.avx = true; | ||||||
|  |                 if ((cpu_id[2] >> 12) & 1) | ||||||
|  |                     caps.fma = true; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         if (max_std_fn >= 7) { | ||||||
|  |             __cpuidex(cpu_id, 0x00000007, 0x00000000); | ||||||
|  |             // Can't enable AVX2 unless the XSAVE/XGETBV checks above passed
 | ||||||
|  |             if ((cpu_id[1] >> 5) & 1) | ||||||
|  |                 caps.avx2 = caps.avx; | ||||||
|  |             if ((cpu_id[1] >> 3) & 1) | ||||||
|  |                 caps.bmi1 = true; | ||||||
|  |             if ((cpu_id[1] >> 8) & 1) | ||||||
|  |                 caps.bmi2 = true; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     caps.flush_to_zero = caps.sse; | ||||||
|  | 
 | ||||||
|  |     if (max_ex_fn >= 0x80000004) { | ||||||
|  |         // Extract CPU model string
 | ||||||
|  |         __cpuid(cpu_id, 0x80000002); | ||||||
|  |         std::memcpy(caps.cpu_string, cpu_id, sizeof(cpu_id)); | ||||||
|  |         __cpuid(cpu_id, 0x80000003); | ||||||
|  |         std::memcpy(caps.cpu_string + 16, cpu_id, sizeof(cpu_id)); | ||||||
|  |         __cpuid(cpu_id, 0x80000004); | ||||||
|  |         std::memcpy(caps.cpu_string + 32, cpu_id, sizeof(cpu_id)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (max_ex_fn >= 0x80000001) { | ||||||
|  |         // Check for more features
 | ||||||
|  |         __cpuid(cpu_id, 0x80000001); | ||||||
|  |         if (cpu_id[2] & 1) caps.lahf_sahf_64 = true; | ||||||
|  |         if ((cpu_id[2] >> 5) & 1) caps.lzcnt = true; | ||||||
|  |         if ((cpu_id[2] >> 16) & 1) caps.fma4 = true; | ||||||
|  |         if ((cpu_id[3] >> 29) & 1) caps.long_mode = true; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return caps; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | const CPUCaps& GetCPUCaps() { | ||||||
|  |     static CPUCaps caps = Detect(); | ||||||
|  |     return caps; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | std::string GetCPUCapsString() { | ||||||
|  |     auto caps = GetCPUCaps(); | ||||||
|  | 
 | ||||||
|  |     std::string sum(caps.cpu_string); | ||||||
|  |     sum += " ("; | ||||||
|  |     sum += caps.brand_string; | ||||||
|  |     sum += ")"; | ||||||
|  | 
 | ||||||
|  |     if (caps.sse) sum += ", SSE"; | ||||||
|  |     if (caps.sse2) { | ||||||
|  |         sum += ", SSE2"; | ||||||
|  |         if (!caps.flush_to_zero) sum += " (without DAZ)"; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (caps.sse3) sum += ", SSE3"; | ||||||
|  |     if (caps.ssse3) sum += ", SSSE3"; | ||||||
|  |     if (caps.sse4_1) sum += ", SSE4.1"; | ||||||
|  |     if (caps.sse4_2) sum += ", SSE4.2"; | ||||||
|  |     if (caps.avx) sum += ", AVX"; | ||||||
|  |     if (caps.avx2) sum += ", AVX2"; | ||||||
|  |     if (caps.bmi1) sum += ", BMI1"; | ||||||
|  |     if (caps.bmi2) sum += ", BMI2"; | ||||||
|  |     if (caps.fma) sum += ", FMA"; | ||||||
|  |     if (caps.aes) sum += ", AES"; | ||||||
|  |     if (caps.movbe) sum += ", MOVBE"; | ||||||
|  |     if (caps.long_mode) sum += ", 64-bit support"; | ||||||
|  | 
 | ||||||
|  |     return sum; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace Common
 | ||||||
							
								
								
									
										66
									
								
								src/common/x64/cpu_detect.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								src/common/x64/cpu_detect.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,66 @@ | ||||||
|  | // Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <string> | ||||||
|  | 
 | ||||||
namespace Common {

/// x86/x64 CPU vendors that may be detected by this module
enum class CPUVendor {
    INTEL,
    AMD,
    OTHER,
};

/// x86/x64 CPU capabilities that may be detected by this module
struct CPUCaps {
    CPUVendor vendor;

    // Processor model string. Detection fills this with up to 48 characters (three 16-byte CPUID
    // results), so the buffer must hold at least 49 bytes including the terminating NUL.
    char cpu_string[0x41];

    // Vendor identification string (e.g. "GenuineIntel"): 12 characters plus the terminating NUL.
    char brand_string[0x21];

    int num_cores;
    bool sse;
    bool sse2;
    bool sse3;
    bool ssse3;
    bool sse4_1;
    bool sse4_2;
    bool lzcnt;
    bool avx;
    bool avx2;
    bool bmi1;
    bool bmi2;
    bool fma;
    bool fma4;
    bool aes;

    // Support for the FXSAVE and FXRSTOR instructions
    bool fxsave_fxrstor;

    bool movbe;

    // This flag indicates that the hardware supports some mode in which denormal inputs and outputs
    // are automatically set to (signed) zero.
    bool flush_to_zero;

    // Support for LAHF and SAHF instructions in 64-bit mode
    bool lahf_sahf_64;

    bool long_mode;
};

/**
 * Gets the supported capabilities of the host CPU
 * @return Reference to a CPUCaps struct with the detected host CPU capabilities
 */
const CPUCaps& GetCPUCaps();

/**
 * Gets a string summary of the name and supported capabilities of the host CPU
 * @return String summary
 */
std::string GetCPUCapsString();

} // namespace Common
 | ||||||
							
								
								
									
										1989
									
								
								src/common/x64/emitter.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1989
									
								
								src/common/x64/emitter.cpp
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1067
									
								
								src/common/x64/emitter.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1067
									
								
								src/common/x64/emitter.h
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -53,6 +53,7 @@ struct Values { | ||||||
| 
 | 
 | ||||||
|     // Renderer
 |     // Renderer
 | ||||||
|     bool use_hw_renderer; |     bool use_hw_renderer; | ||||||
|  |     bool use_shader_jit; | ||||||
| 
 | 
 | ||||||
|     float bg_red; |     float bg_red; | ||||||
|     float bg_green; |     float bg_green; | ||||||
|  |  | ||||||
|  | @ -11,8 +11,9 @@ set(SRCS | ||||||
|             pica.cpp |             pica.cpp | ||||||
|             primitive_assembly.cpp |             primitive_assembly.cpp | ||||||
|             rasterizer.cpp |             rasterizer.cpp | ||||||
|  |             shader/shader.cpp | ||||||
|  |             shader/shader_interpreter.cpp | ||||||
|             utils.cpp |             utils.cpp | ||||||
|             vertex_shader.cpp |  | ||||||
|             video_core.cpp |             video_core.cpp | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|  | @ -35,11 +36,20 @@ set(HEADERS | ||||||
|             primitive_assembly.h |             primitive_assembly.h | ||||||
|             rasterizer.h |             rasterizer.h | ||||||
|             renderer_base.h |             renderer_base.h | ||||||
|  |             shader/shader.h | ||||||
|  |             shader/shader_interpreter.h | ||||||
|             utils.h |             utils.h | ||||||
|             vertex_shader.h |  | ||||||
|             video_core.h |             video_core.h | ||||||
|             ) |             ) | ||||||
| 
 | 
 | ||||||
|  | if(ARCHITECTURE_x86_64) | ||||||
|  |     set(SRCS ${SRCS} | ||||||
|  |             shader/shader_jit_x64.cpp) | ||||||
|  | 
 | ||||||
|  |     set(HEADERS ${HEADERS} | ||||||
|  |             shader/shader_jit_x64.h) | ||||||
|  | endif() | ||||||
|  | 
 | ||||||
| create_directory_groups(${SRCS} ${HEADERS}) | create_directory_groups(${SRCS} ${HEADERS}) | ||||||
| 
 | 
 | ||||||
| add_library(video_core STATIC ${SRCS} ${HEADERS}) | add_library(video_core STATIC ${SRCS} ${HEADERS}) | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| #include "clipper.h" | #include "clipper.h" | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
| #include "rasterizer.h" | #include "rasterizer.h" | ||||||
| #include "vertex_shader.h" | #include "shader/shader_interpreter.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,13 +6,13 @@ | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
| namespace VertexShader { | namespace Shader { | ||||||
|     struct OutputVertex; |     struct OutputVertex; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| namespace Clipper { | namespace Clipper { | ||||||
| 
 | 
 | ||||||
| using VertexShader::OutputVertex; | using Shader::OutputVertex; | ||||||
| 
 | 
 | ||||||
| void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); | void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -18,7 +18,7 @@ | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
| #include "primitive_assembly.h" | #include "primitive_assembly.h" | ||||||
| #include "renderer_base.h" | #include "renderer_base.h" | ||||||
| #include "vertex_shader.h" | #include "shader/shader_interpreter.h" | ||||||
| #include "video_core.h" | #include "video_core.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
|  | @ -165,7 +165,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|             DebugUtils::GeometryDumper geometry_dumper; |             DebugUtils::GeometryDumper geometry_dumper; | ||||||
|             PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); |             PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); | ||||||
| #endif | #endif | ||||||
|             PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); |             PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); | ||||||
| 
 | 
 | ||||||
|             if (g_debug_context) { |             if (g_debug_context) { | ||||||
|                 for (int i = 0; i < 3; ++i) { |                 for (int i = 0; i < 3; ++i) { | ||||||
|  | @ -210,11 +210,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|             // The size has been tuned for optimal balance between hit-rate and the cost of lookup
 |             // The size has been tuned for optimal balance between hit-rate and the cost of lookup
 | ||||||
|             const size_t VERTEX_CACHE_SIZE = 32; |             const size_t VERTEX_CACHE_SIZE = 32; | ||||||
|             std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; |             std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; | ||||||
|             std::array<VertexShader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; |             std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; | ||||||
| 
 | 
 | ||||||
|             unsigned int vertex_cache_pos = 0; |             unsigned int vertex_cache_pos = 0; | ||||||
|             vertex_cache_ids.fill(-1); |             vertex_cache_ids.fill(-1); | ||||||
| 
 | 
 | ||||||
|  |             Shader::UnitState shader_unit; | ||||||
|  |             Shader::Setup(shader_unit); | ||||||
|  | 
 | ||||||
|             for (unsigned int index = 0; index < regs.num_vertices; ++index) |             for (unsigned int index = 0; index < regs.num_vertices; ++index) | ||||||
|             { |             { | ||||||
|                 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; |                 unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; | ||||||
|  | @ -224,7 +227,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|                 ASSERT(vertex != -1); |                 ASSERT(vertex != -1); | ||||||
| 
 | 
 | ||||||
|                 bool vertex_cache_hit = false; |                 bool vertex_cache_hit = false; | ||||||
|                 VertexShader::OutputVertex output; |                 Shader::OutputVertex output; | ||||||
| 
 | 
 | ||||||
|                 if (is_indexed) { |                 if (is_indexed) { | ||||||
|                     if (g_debug_context && Pica::g_debug_context->recorder) { |                     if (g_debug_context && Pica::g_debug_context->recorder) { | ||||||
|  | @ -243,7 +246,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
| 
 | 
 | ||||||
|                 if (!vertex_cache_hit) { |                 if (!vertex_cache_hit) { | ||||||
|                     // Initialize data for the current vertex
 |                     // Initialize data for the current vertex
 | ||||||
|                     VertexShader::InputVertex input; |                     Shader::InputVertex input; | ||||||
| 
 | 
 | ||||||
|                     for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { |                     for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { | ||||||
|                         if (vertex_attribute_elements[i] != 0) { |                         if (vertex_attribute_elements[i] != 0) { | ||||||
|  | @ -306,9 +309,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|                                                              std::bind(&DebugUtils::GeometryDumper::AddTriangle, |                                                              std::bind(&DebugUtils::GeometryDumper::AddTriangle, | ||||||
|                                                                        &geometry_dumper, _1, _2, _3)); |                                                                        &geometry_dumper, _1, _2, _3)); | ||||||
| #endif | #endif | ||||||
| 
 |  | ||||||
|                     // Send to vertex shader
 |                     // Send to vertex shader
 | ||||||
|                     output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); |                     output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); | ||||||
| 
 | 
 | ||||||
|                     if (is_indexed) { |                     if (is_indexed) { | ||||||
|                         vertex_cache[vertex_cache_pos] = output; |                         vertex_cache[vertex_cache_pos] = output; | ||||||
|  | @ -319,9 +321,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
| 
 | 
 | ||||||
|                 if (Settings::values.use_hw_renderer) { |                 if (Settings::values.use_hw_renderer) { | ||||||
|                     // Send to hardware renderer
 |                     // Send to hardware renderer
 | ||||||
|                     static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, |                     static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0, | ||||||
|                                                    const Pica::VertexShader::OutputVertex& v1, |                                                    const Pica::Shader::OutputVertex& v1, | ||||||
|                                                    const Pica::VertexShader::OutputVertex& v2) { |                                                    const Pica::Shader::OutputVertex& v2) { | ||||||
|                         VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); |                         VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); | ||||||
|                     }; |                     }; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| namespace VertexShader { | namespace Shader { | ||||||
| struct OutputVertex; | struct OutputVertex; | ||||||
| } | } | ||||||
| } | } | ||||||
|  | @ -24,9 +24,9 @@ public: | ||||||
|     virtual void Reset() = 0; |     virtual void Reset() = 0; | ||||||
| 
 | 
 | ||||||
|     /// Queues the primitive formed by the given vertices for rendering
 |     /// Queues the primitive formed by the given vertices for rendering
 | ||||||
|     virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, |     virtual void AddTriangle(const Pica::Shader::OutputVertex& v0, | ||||||
|                              const Pica::VertexShader::OutputVertex& v1, |                              const Pica::Shader::OutputVertex& v1, | ||||||
|                              const Pica::VertexShader::OutputVertex& v2) = 0; |                              const Pica::Shader::OutputVertex& v2) = 0; | ||||||
| 
 | 
 | ||||||
|     /// Draw the current batch of triangles
 |     /// Draw the current batch of triangles
 | ||||||
|     virtual void DrawTriangles() = 0; |     virtual void DrawTriangles() = 0; | ||||||
|  |  | ||||||
|  | @ -6,6 +6,7 @@ | ||||||
| #include <unordered_map> | #include <unordered_map> | ||||||
| 
 | 
 | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
|  | #include "shader/shader.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
|  | @ -84,6 +85,8 @@ void Init() { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void Shutdown() { | void Shutdown() { | ||||||
|  |     Shader::Shutdown(); | ||||||
|  | 
 | ||||||
|     memset(&g_state, 0, sizeof(State)); |     memset(&g_state, 0, sizeof(State)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1083,6 +1083,7 @@ private: | ||||||
|     // TODO: Perform proper arithmetic on this!
 |     // TODO: Perform proper arithmetic on this!
 | ||||||
|     float value; |     float value; | ||||||
| }; | }; | ||||||
|  | static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); | ||||||
| 
 | 
 | ||||||
| /// Struct used to describe current Pica state
 | /// Struct used to describe current Pica state
 | ||||||
| struct State { | struct State { | ||||||
|  | @ -1092,7 +1093,10 @@ struct State { | ||||||
|     /// Vertex shader memory
 |     /// Vertex shader memory
 | ||||||
|     struct ShaderSetup { |     struct ShaderSetup { | ||||||
|         struct { |         struct { | ||||||
|             Math::Vec4<float24> f[96]; |             // The float uniforms are accessed by the shader JIT using SSE instructions, and are
 | ||||||
|  |             // therefore required to be 16-byte aligned.
 | ||||||
|  |             Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); | ||||||
|  | 
 | ||||||
|             std::array<bool, 16> b; |             std::array<bool, 16> b; | ||||||
|             std::array<Math::Vec4<u8>, 4> i; |             std::array<Math::Vec4<u8>, 4> i; | ||||||
|         } uniforms; |         } uniforms; | ||||||
|  |  | ||||||
|  | @ -4,7 +4,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
| #include "primitive_assembly.h" | #include "primitive_assembly.h" | ||||||
| #include "vertex_shader.h" | #include "shader/shader_interpreter.h" | ||||||
| 
 | 
 | ||||||
| #include "common/logging/log.h" | #include "common/logging/log.h" | ||||||
| #include "video_core/debug_utils/debug_utils.h" | #include "video_core/debug_utils/debug_utils.h" | ||||||
|  | @ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl | ||||||
| 
 | 
 | ||||||
| // explicitly instantiate use cases
 | // explicitly instantiate use cases
 | ||||||
| template | template | ||||||
| struct PrimitiveAssembler<VertexShader::OutputVertex>; | struct PrimitiveAssembler<Shader::OutputVertex>; | ||||||
| template | template | ||||||
| struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; | struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -8,7 +8,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "video_core/pica.h" | #include "video_core/pica.h" | ||||||
| 
 | 
 | ||||||
| #include "video_core/vertex_shader.h" | #include "video_core/shader/shader_interpreter.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ | ||||||
| #include "math.h" | #include "math.h" | ||||||
| #include "pica.h" | #include "pica.h" | ||||||
| #include "rasterizer.h" | #include "rasterizer.h" | ||||||
| #include "vertex_shader.h" | #include "shader/shader_interpreter.h" | ||||||
| #include "video_core/utils.h" | #include "video_core/utils.h" | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
|  | @ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization") | ||||||
|  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing |  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing | ||||||
|  * culling via recursion. |  * culling via recursion. | ||||||
|  */ |  */ | ||||||
| static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, | static void ProcessTriangleInternal(const Shader::OutputVertex& v0, | ||||||
|                                     const VertexShader::OutputVertex& v1, |                                     const Shader::OutputVertex& v1, | ||||||
|                                     const VertexShader::OutputVertex& v2, |                                     const Shader::OutputVertex& v2, | ||||||
|                                     bool reversed = false) |                                     bool reversed = false) | ||||||
| { | { | ||||||
|     const auto& regs = g_state.regs; |     const auto& regs = g_state.regs; | ||||||
|  | @ -1107,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void ProcessTriangle(const VertexShader::OutputVertex& v0, | void ProcessTriangle(const Shader::OutputVertex& v0, | ||||||
|                      const VertexShader::OutputVertex& v1, |                      const Shader::OutputVertex& v1, | ||||||
|                      const VertexShader::OutputVertex& v2) { |                      const Shader::OutputVertex& v2) { | ||||||
|     ProcessTriangleInternal(v0, v1, v2); |     ProcessTriangleInternal(v0, v1, v2); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,15 +6,15 @@ | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
| namespace VertexShader { | namespace Shader { | ||||||
|     struct OutputVertex; |     struct OutputVertex; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| namespace Rasterizer { | namespace Rasterizer { | ||||||
| 
 | 
 | ||||||
| void ProcessTriangle(const VertexShader::OutputVertex& v0, | void ProcessTriangle(const Shader::OutputVertex& v0, | ||||||
|                      const VertexShader::OutputVertex& v1, |                      const Shader::OutputVertex& v1, | ||||||
|                      const VertexShader::OutputVertex& v2); |                      const Shader::OutputVertex& v2); | ||||||
| 
 | 
 | ||||||
| } // namespace Rasterizer
 | } // namespace Rasterizer
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -202,9 +202,9 @@ void RasterizerOpenGL::Reset() { | ||||||
|     res_cache.FullFlush(); |     res_cache.FullFlush(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, | void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, | ||||||
|                                    const Pica::VertexShader::OutputVertex& v1, |                                    const Pica::Shader::OutputVertex& v1, | ||||||
|                                    const Pica::VertexShader::OutputVertex& v2) { |                                    const Pica::Shader::OutputVertex& v2) { | ||||||
|     vertex_batch.push_back(HardwareVertex(v0)); |     vertex_batch.push_back(HardwareVertex(v0)); | ||||||
|     vertex_batch.push_back(HardwareVertex(v1)); |     vertex_batch.push_back(HardwareVertex(v1)); | ||||||
|     vertex_batch.push_back(HardwareVertex(v2)); |     vertex_batch.push_back(HardwareVertex(v2)); | ||||||
|  |  | ||||||
|  | @ -9,7 +9,7 @@ | ||||||
| #include "common/common_types.h" | #include "common/common_types.h" | ||||||
| 
 | 
 | ||||||
| #include "video_core/hwrasterizer_base.h" | #include "video_core/hwrasterizer_base.h" | ||||||
| #include "video_core/vertex_shader.h" | #include "video_core/shader/shader_interpreter.h" | ||||||
| 
 | 
 | ||||||
| #include "gl_state.h" | #include "gl_state.h" | ||||||
| #include "gl_rasterizer_cache.h" | #include "gl_rasterizer_cache.h" | ||||||
|  | @ -27,9 +27,9 @@ public: | ||||||
|     void Reset() override; |     void Reset() override; | ||||||
| 
 | 
 | ||||||
|     /// Queues the primitive formed by the given vertices for rendering
 |     /// Queues the primitive formed by the given vertices for rendering
 | ||||||
|     void AddTriangle(const Pica::VertexShader::OutputVertex& v0, |     void AddTriangle(const Pica::Shader::OutputVertex& v0, | ||||||
|                      const Pica::VertexShader::OutputVertex& v1, |                      const Pica::Shader::OutputVertex& v1, | ||||||
|                      const Pica::VertexShader::OutputVertex& v2) override; |                      const Pica::Shader::OutputVertex& v2) override; | ||||||
| 
 | 
 | ||||||
|     /// Draw the current batch of triangles
 |     /// Draw the current batch of triangles
 | ||||||
|     void DrawTriangles() override; |     void DrawTriangles() override; | ||||||
|  | @ -82,7 +82,7 @@ private: | ||||||
| 
 | 
 | ||||||
|     /// Structure that the hardware rendered vertices are composed of
 |     /// Structure that the hardware rendered vertices are composed of
 | ||||||
|     struct HardwareVertex { |     struct HardwareVertex { | ||||||
|         HardwareVertex(const Pica::VertexShader::OutputVertex& v) { |         HardwareVertex(const Pica::Shader::OutputVertex& v) { | ||||||
|             position[0] = v.pos.x.ToFloat32(); |             position[0] = v.pos.x.ToFloat32(); | ||||||
|             position[1] = v.pos.y.ToFloat32(); |             position[1] = v.pos.y.ToFloat32(); | ||||||
|             position[2] = v.pos.z.ToFloat32(); |             position[2] = v.pos.z.ToFloat32(); | ||||||
|  |  | ||||||
							
								
								
									
										145
									
								
								src/video_core/shader/shader.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										145
									
								
								src/video_core/shader/shader.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,145 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #include <memory> | ||||||
|  | #include <unordered_map> | ||||||
|  | 
 | ||||||
|  | #include "common/hash.h" | ||||||
|  | #include "common/make_unique.h" | ||||||
|  | #include "common/profiler.h" | ||||||
|  | 
 | ||||||
|  | #include "video_core/debug_utils/debug_utils.h" | ||||||
|  | #include "video_core/pica.h" | ||||||
|  | #include "video_core/video_core.h" | ||||||
|  | 
 | ||||||
|  | #include "shader.h" | ||||||
|  | #include "shader_interpreter.h" | ||||||
|  | 
 | ||||||
|  | #ifdef ARCHITECTURE_x86_64 | ||||||
|  | #include "shader_jit_x64.h" | ||||||
|  | #endif // ARCHITECTURE_x86_64
 | ||||||
|  | 
 | ||||||
|  | namespace Pica { | ||||||
|  | 
 | ||||||
|  | namespace Shader { | ||||||
|  | 
 | ||||||
|  | #ifdef ARCHITECTURE_x86_64 | ||||||
|  | static std::unordered_map<u64, CompiledShader*> shader_map; | ||||||
|  | static JitCompiler jit; | ||||||
|  | static CompiledShader* jit_shader; | ||||||
|  | #endif // ARCHITECTURE_x86_64
 | ||||||
|  | 
 | ||||||
|  | void Setup(UnitState& state) { | ||||||
|  | #ifdef ARCHITECTURE_x86_64 | ||||||
|  |     if (VideoCore::g_shader_jit_enabled) { | ||||||
|  |         u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ | ||||||
|  |             Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ | ||||||
|  |             g_state.regs.vs.main_offset); | ||||||
|  | 
 | ||||||
|  |         auto iter = shader_map.find(cache_key); | ||||||
|  |         if (iter != shader_map.end()) { | ||||||
|  |             jit_shader = iter->second; | ||||||
|  |         } else { | ||||||
|  |             jit_shader = jit.Compile(); | ||||||
|  |             shader_map.emplace(cache_key, jit_shader); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | #endif // ARCHITECTURE_x86_64
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void Shutdown() { | ||||||
|  |     shader_map.clear(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static Common::Profiling::TimingCategory shader_category("Vertex Shader"); | ||||||
|  | 
 | ||||||
|  | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) { | ||||||
|  |     auto& config = g_state.regs.vs; | ||||||
|  |     auto& setup = g_state.vs; | ||||||
|  | 
 | ||||||
|  |     Common::Profiling::ScopeTimer timer(shader_category); | ||||||
|  | 
 | ||||||
|  |     state.program_counter = config.main_offset; | ||||||
|  |     state.debug.max_offset = 0; | ||||||
|  |     state.debug.max_opdesc_id = 0; | ||||||
|  | 
 | ||||||
|  |     // Setup input register table
 | ||||||
|  |     const auto& attribute_register_map = config.input_register_map; | ||||||
|  | 
 | ||||||
|  |     if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; | ||||||
|  |     if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; | ||||||
|  |     if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; | ||||||
|  |     if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; | ||||||
|  |     if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; | ||||||
|  |     if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; | ||||||
|  |     if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; | ||||||
|  |     if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; | ||||||
|  |     if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; | ||||||
|  |     if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; | ||||||
|  |     if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; | ||||||
|  |     if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; | ||||||
|  |     if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; | ||||||
|  |     if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; | ||||||
|  |     if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; | ||||||
|  |     if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; | ||||||
|  | 
 | ||||||
|  |     state.conditional_code[0] = false; | ||||||
|  |     state.conditional_code[1] = false; | ||||||
|  | 
 | ||||||
|  | #ifdef ARCHITECTURE_x86_64 | ||||||
|  |     if (VideoCore::g_shader_jit_enabled) | ||||||
|  |         jit_shader(&state.registers); | ||||||
|  |     else | ||||||
|  |         RunInterpreter(state); | ||||||
|  | #else | ||||||
|  |     RunInterpreter(state); | ||||||
|  | #endif // ARCHITECTURE_x86_64
 | ||||||
|  | 
 | ||||||
|  | #if PICA_DUMP_SHADERS | ||||||
|  |     DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), | ||||||
|  |         state.debug.max_opdesc_id, config.main_offset, | ||||||
|  |         g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
 | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  |     // Setup output data
 | ||||||
|  |     OutputVertex ret; | ||||||
|  |     // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
 | ||||||
|  |     // figure out what those circumstances are and enable the remaining outputs then.
 | ||||||
|  |     for (int i = 0; i < 7; ++i) { | ||||||
|  |         const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here
 | ||||||
|  | 
 | ||||||
|  |         u32 semantics[4] = { | ||||||
|  |             output_register_map.map_x, output_register_map.map_y, | ||||||
|  |             output_register_map.map_z, output_register_map.map_w | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         for (int comp = 0; comp < 4; ++comp) { | ||||||
|  |             float24* out = ((float24*)&ret) + semantics[comp]; | ||||||
|  |             if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | ||||||
|  |                 *out = state.registers.output[i][comp]; | ||||||
|  |             } else { | ||||||
|  |                 // Zero output so that attributes which aren't output won't have denormals in them,
 | ||||||
|  |                 // which would slow us down later.
 | ||||||
|  |                 memset(out, 0, sizeof(*out)); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation
 | ||||||
|  |     for (int i = 0; i < 4; ++i) { | ||||||
|  |         ret.color[i] = float24::FromFloat32( | ||||||
|  |             std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", | ||||||
|  |         ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), | ||||||
|  |         ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||||||
|  |         ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); | ||||||
|  | 
 | ||||||
|  |     return ret; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace Shader
 | ||||||
|  | 
 | ||||||
|  | } // namespace Pica
 | ||||||
							
								
								
									
										169
									
								
								src/video_core/shader/shader.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										169
									
								
								src/video_core/shader/shader.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,169 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <boost/container/static_vector.hpp> | ||||||
|  | #include <nihstro/shader_binary.h> | ||||||
|  | 
 | ||||||
|  | #include "common/common_funcs.h" | ||||||
|  | #include "common/common_types.h" | ||||||
|  | #include "common/vector_math.h" | ||||||
|  | 
 | ||||||
|  | #include "video_core/pica.h" | ||||||
|  | 
 | ||||||
|  | using nihstro::RegisterType; | ||||||
|  | using nihstro::SourceRegister; | ||||||
|  | using nihstro::DestRegister; | ||||||
|  | 
 | ||||||
|  | namespace Pica { | ||||||
|  | 
 | ||||||
|  | namespace Shader { | ||||||
|  | 
 | ||||||
|  | struct InputVertex { | ||||||
|  |     Math::Vec4<float24> attr[16]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct OutputVertex { | ||||||
|  |     OutputVertex() = default; | ||||||
|  | 
 | ||||||
|  |     // VS output attributes
 | ||||||
|  |     Math::Vec4<float24> pos; | ||||||
|  |     Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
 | ||||||
|  |     Math::Vec4<float24> color; | ||||||
|  |     Math::Vec2<float24> tc0; | ||||||
|  |     Math::Vec2<float24> tc1; | ||||||
|  |     float24 pad[6]; | ||||||
|  |     Math::Vec2<float24> tc2; | ||||||
|  | 
 | ||||||
|  |     // Padding for optimal alignment
 | ||||||
|  |     float24 pad2[4]; | ||||||
|  | 
 | ||||||
|  |     // Attributes used to store intermediate results
 | ||||||
|  | 
 | ||||||
|  |     // position after perspective divide
 | ||||||
|  |     Math::Vec3<float24> screenpos; | ||||||
|  |     float24 pad3; | ||||||
|  | 
 | ||||||
|  |     // Linear interpolation
 | ||||||
|  |     // factor: 0=this, 1=vtx
 | ||||||
|  |     void Lerp(float24 factor, const OutputVertex& vtx) { | ||||||
|  |         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||||||
|  | 
 | ||||||
|  |         // TODO: Should perform perspective correct interpolation here...
 | ||||||
|  |         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||||||
|  |         tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||||||
|  |         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||||||
|  | 
 | ||||||
|  |         screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); | ||||||
|  | 
 | ||||||
|  |         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Linear interpolation
 | ||||||
|  |     // factor: 0=v0, 1=v1
 | ||||||
|  |     static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { | ||||||
|  |         OutputVertex ret = v0; | ||||||
|  |         ret.Lerp(factor, v1); | ||||||
|  |         return ret; | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||||||
|  | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * This structure contains the state information that needs to be unique for a shader unit. The 3DS | ||||||
|  |  * has four shader units that process shaders in parallel. At the present, Citra only implements a | ||||||
|  |  * single shader unit that processes all shaders serially. Putting the state information in a struct | ||||||
|  |  * here will make it easier for us to parallelize the shader processing later. | ||||||
|  |  */ | ||||||
|  | struct UnitState { | ||||||
|  |     struct Registers { | ||||||
|  |         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
 | ||||||
|  |         // required to be 16-byte aligned.
 | ||||||
|  |         Math::Vec4<float24> MEMORY_ALIGNED16(input[16]); | ||||||
|  |         Math::Vec4<float24> MEMORY_ALIGNED16(output[16]); | ||||||
|  |         Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]); | ||||||
|  |     } registers; | ||||||
|  |     static_assert(std::is_pod<Registers>::value, "Structure is not POD"); | ||||||
|  | 
 | ||||||
|  |     u32 program_counter; | ||||||
|  |     bool conditional_code[2]; | ||||||
|  | 
 | ||||||
|  |     // Two Address registers and one loop counter
 | ||||||
|  |     // TODO: How many bits do these actually have?
 | ||||||
|  |     s32 address_registers[3]; | ||||||
|  | 
 | ||||||
|  |     enum { | ||||||
|  |         INVALID_ADDRESS = 0xFFFFFFFF | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     struct CallStackElement { | ||||||
|  |         u32 final_address;  // Address upon which we jump to return_address
 | ||||||
|  |         u32 return_address; // Where to jump when leaving scope
 | ||||||
|  |         u8 repeat_counter;  // How often to repeat until this call stack element is removed
 | ||||||
|  |         u8 loop_increment;  // Which value to add to the loop counter after an iteration
 | ||||||
|  |                             // TODO: Should this be a signed value? Does it even matter?
 | ||||||
|  |         u32 loop_address;   // The address where we'll return to after each loop iteration
 | ||||||
|  |     }; | ||||||
|  | 
 | ||||||
|  |     // TODO: Is there a maximal size for this?
 | ||||||
|  |     boost::container::static_vector<CallStackElement, 16> call_stack; | ||||||
|  | 
 | ||||||
|  |     struct { | ||||||
|  |         u32 max_offset; // maximum program counter ever reached
 | ||||||
|  |         u32 max_opdesc_id; // maximum swizzle pattern index ever used
 | ||||||
|  |     } debug; | ||||||
|  | 
 | ||||||
|  |     static int InputOffset(const SourceRegister& reg) { | ||||||
|  |         switch (reg.GetRegisterType()) { | ||||||
|  |         case RegisterType::Input: | ||||||
|  |             return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||||||
|  | 
 | ||||||
|  |         case RegisterType::Temporary: | ||||||
|  |             return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||||||
|  | 
 | ||||||
|  |         default: | ||||||
|  |             UNREACHABLE(); | ||||||
|  |             return 0; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     static int OutputOffset(const DestRegister& reg) { | ||||||
|  |         switch (reg.GetRegisterType()) { | ||||||
|  |         case RegisterType::Output: | ||||||
|  |             return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||||||
|  | 
 | ||||||
|  |         case RegisterType::Temporary: | ||||||
|  |             return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); | ||||||
|  | 
 | ||||||
|  |         default: | ||||||
|  |             UNREACHABLE(); | ||||||
|  |             return 0; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per | ||||||
|  |  * vertex, which would happen within the `Run` function). | ||||||
|  |  * @param state Shader unit state, must be setup per shader and per shader unit | ||||||
|  |  */ | ||||||
|  | void Setup(UnitState& state); | ||||||
|  | 
 | ||||||
|  | /// Performs any cleanup when the emulator is shutdown
 | ||||||
|  | void Shutdown(); | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Runs the currently setup shader | ||||||
|  |  * @param state Shader unit state, must be setup per shader and per shader unit | ||||||
|  |  * @param input Input vertex into the shader | ||||||
|  |  * @param num_attributes The number of vertex shader attributes | ||||||
|  |  * @return The output vertex, after having been processed by the vertex shader | ||||||
|  |  */ | ||||||
|  | OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes); | ||||||
|  | 
 | ||||||
|  | } // namespace Shader
 | ||||||
|  | 
 | ||||||
|  | } // namespace Pica
 | ||||||
|  | @ -2,18 +2,14 @@ | ||||||
| // Licensed under GPLv2 or any later version
 | // Licensed under GPLv2 or any later version
 | ||||||
| // Refer to the license.txt file included.
 | // Refer to the license.txt file included.
 | ||||||
| 
 | 
 | ||||||
| #include <boost/container/static_vector.hpp> |  | ||||||
| #include <boost/range/algorithm.hpp> |  | ||||||
| 
 |  | ||||||
| #include <common/file_util.h> | #include <common/file_util.h> | ||||||
| 
 | 
 | ||||||
| #include <nihstro/shader_bytecode.h> | #include <nihstro/shader_bytecode.h> | ||||||
| 
 | 
 | ||||||
| #include "common/profiler.h" | #include "video_core/pica.h" | ||||||
| 
 | 
 | ||||||
| #include "pica.h" | #include "shader.h" | ||||||
| #include "vertex_shader.h" | #include "shader_interpreter.h" | ||||||
| #include "debug_utils/debug_utils.h" |  | ||||||
| 
 | 
 | ||||||
| using nihstro::OpCode; | using nihstro::OpCode; | ||||||
| using nihstro::Instruction; | using nihstro::Instruction; | ||||||
|  | @ -23,44 +19,9 @@ using nihstro::SwizzlePattern; | ||||||
| 
 | 
 | ||||||
| namespace Pica { | namespace Pica { | ||||||
| 
 | 
 | ||||||
| namespace VertexShader { | namespace Shader { | ||||||
| 
 | 
 | ||||||
| struct VertexShaderState { | void RunInterpreter(UnitState& state) { | ||||||
|     u32 program_counter; |  | ||||||
| 
 |  | ||||||
|     const float24* input_register_table[16]; |  | ||||||
|     Math::Vec4<float24> output_registers[16]; |  | ||||||
| 
 |  | ||||||
|     Math::Vec4<float24> temporary_registers[16]; |  | ||||||
|     bool conditional_code[2]; |  | ||||||
| 
 |  | ||||||
|     // Two Address registers and one loop counter
 |  | ||||||
|     // TODO: How many bits do these actually have?
 |  | ||||||
|     s32 address_registers[3]; |  | ||||||
| 
 |  | ||||||
|     enum { |  | ||||||
|         INVALID_ADDRESS = 0xFFFFFFFF |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     struct CallStackElement { |  | ||||||
|         u32 final_address;  // Address upon which we jump to return_address
 |  | ||||||
|         u32 return_address; // Where to jump when leaving scope
 |  | ||||||
|         u8 repeat_counter;  // How often to repeat until this call stack element is removed
 |  | ||||||
|         u8 loop_increment;  // Which value to add to the loop counter after an iteration
 |  | ||||||
|                             // TODO: Should this be a signed value? Does it even matter?
 |  | ||||||
|         u32 loop_address;   // The address where we'll return to after each loop iteration
 |  | ||||||
|     }; |  | ||||||
| 
 |  | ||||||
|     // TODO: Is there a maximal size for this?
 |  | ||||||
|     boost::container::static_vector<CallStackElement, 16> call_stack; |  | ||||||
| 
 |  | ||||||
|     struct { |  | ||||||
|         u32 max_offset; // maximum program counter ever reached
 |  | ||||||
|         u32 max_opdesc_id; // maximum swizzle pattern index ever used
 |  | ||||||
|     } debug; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static void ProcessShaderCode(VertexShaderState& state) { |  | ||||||
|     const auto& uniforms = g_state.vs.uniforms; |     const auto& uniforms = g_state.vs.uniforms; | ||||||
|     const auto& swizzle_data = g_state.vs.swizzle_data; |     const auto& swizzle_data = g_state.vs.swizzle_data; | ||||||
|     const auto& program_code = g_state.vs.program_code; |     const auto& program_code = g_state.vs.program_code; | ||||||
|  | @ -90,7 +51,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
|         const Instruction instr = { program_code[state.program_counter] }; |         const Instruction instr = { program_code[state.program_counter] }; | ||||||
|         const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; |         const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; | ||||||
| 
 | 
 | ||||||
|         static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, |         static auto call = [](UnitState& state, u32 offset, u32 num_instructions, | ||||||
|                               u32 return_offset, u8 repeat_count, u8 loop_increment) { |                               u32 return_offset, u8 repeat_count, u8 loop_increment) { | ||||||
|             state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
 |             state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
 | ||||||
|             ASSERT(state.call_stack.size() < state.call_stack.capacity()); |             ASSERT(state.call_stack.size() < state.call_stack.capacity()); | ||||||
|  | @ -101,10 +62,10 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
|         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { |         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { | ||||||
|             switch (source_reg.GetRegisterType()) { |             switch (source_reg.GetRegisterType()) { | ||||||
|             case RegisterType::Input: |             case RegisterType::Input: | ||||||
|                 return state.input_register_table[source_reg.GetIndex()]; |                 return &state.registers.input[source_reg.GetIndex()].x; | ||||||
| 
 | 
 | ||||||
|             case RegisterType::Temporary: |             case RegisterType::Temporary: | ||||||
|                 return &state.temporary_registers[source_reg.GetIndex()].x; |                 return &state.registers.temporary[source_reg.GetIndex()].x; | ||||||
| 
 | 
 | ||||||
|             case RegisterType::FloatUniform: |             case RegisterType::FloatUniform: | ||||||
|                 return &uniforms.f[source_reg.GetIndex()].x; |                 return &uniforms.f[source_reg.GetIndex()].x; | ||||||
|  | @ -153,8 +114,8 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
|                 src2[3] = src2[3] * float24::FromFloat32(-1); |                 src2[3] = src2[3] * float24::FromFloat32(-1); | ||||||
|             } |             } | ||||||
| 
 | 
 | ||||||
|             float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] |             float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] | ||||||
|                         : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] |                         : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | ||||||
|                         : dummy_vec4_float24; |                         : dummy_vec4_float24; | ||||||
| 
 | 
 | ||||||
|             state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); |             state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); | ||||||
|  | @ -394,8 +355,8 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
|                     src3[3] = src3[3] * float24::FromFloat32(-1); |                     src3[3] = src3[3] * float24::FromFloat32(-1); | ||||||
|                 } |                 } | ||||||
| 
 | 
 | ||||||
|                 float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] |                 float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] | ||||||
|                             : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] |                             : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | ||||||
|                             : dummy_vec4_float24; |                             : dummy_vec4_float24; | ||||||
| 
 | 
 | ||||||
|                 for (int i = 0; i < 4; ++i) { |                 for (int i = 0; i < 4; ++i) { | ||||||
|  | @ -413,7 +374,7 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
| 
 | 
 | ||||||
|         default: |         default: | ||||||
|         { |         { | ||||||
|             static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { |             static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { | ||||||
|                 bool results[2] = { refx == state.conditional_code[0], |                 bool results[2] = { refx == state.conditional_code[0], | ||||||
|                                     refy == state.conditional_code[1] }; |                                     refy == state.conditional_code[1] }; | ||||||
| 
 | 
 | ||||||
|  | @ -542,88 +503,6 @@ static void ProcessShaderCode(VertexShaderState& state) { | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static Common::Profiling::TimingCategory shader_category("Vertex Shader"); |  | ||||||
| 
 |  | ||||||
| OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { |  | ||||||
|     Common::Profiling::ScopeTimer timer(shader_category); |  | ||||||
| 
 |  | ||||||
|     VertexShaderState state; |  | ||||||
| 
 |  | ||||||
|     state.program_counter = config.main_offset; |  | ||||||
|     state.debug.max_offset = 0; |  | ||||||
|     state.debug.max_opdesc_id = 0; |  | ||||||
| 
 |  | ||||||
|     // Setup input register table
 |  | ||||||
|     const auto& attribute_register_map = config.input_register_map; |  | ||||||
|     float24 dummy_register; |  | ||||||
|     boost::fill(state.input_register_table, &dummy_register); |  | ||||||
| 
 |  | ||||||
|     if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; |  | ||||||
|     if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; |  | ||||||
|     if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; |  | ||||||
|     if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; |  | ||||||
|     if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; |  | ||||||
|     if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; |  | ||||||
|     if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; |  | ||||||
|     if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; |  | ||||||
|     if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; |  | ||||||
|     if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; |  | ||||||
|     if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; |  | ||||||
|     if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; |  | ||||||
|     if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; |  | ||||||
|     if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; |  | ||||||
|     if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; |  | ||||||
|     if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; |  | ||||||
| 
 |  | ||||||
|     state.conditional_code[0] = false; |  | ||||||
|     state.conditional_code[1] = false; |  | ||||||
| 
 |  | ||||||
|     ProcessShaderCode(state); |  | ||||||
| #if PICA_DUMP_SHADERS |  | ||||||
|     DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), |  | ||||||
|                            state.debug.max_opdesc_id, config.main_offset, |  | ||||||
|                            g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here
 |  | ||||||
| #endif |  | ||||||
| 
 |  | ||||||
|     // Setup output data
 |  | ||||||
|     OutputVertex ret; |  | ||||||
|     // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
 |  | ||||||
|     // figure out what those circumstances are and enable the remaining outputs then.
 |  | ||||||
|     for (int i = 0; i < 7; ++i) { |  | ||||||
|         const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here
 |  | ||||||
| 
 |  | ||||||
|         u32 semantics[4] = { |  | ||||||
|             output_register_map.map_x, output_register_map.map_y, |  | ||||||
|             output_register_map.map_z, output_register_map.map_w |  | ||||||
|         }; |  | ||||||
| 
 |  | ||||||
|         for (int comp = 0; comp < 4; ++comp) { |  | ||||||
|             float24* out = ((float24*)&ret) + semantics[comp]; |  | ||||||
|             if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { |  | ||||||
|                 *out = state.output_registers[i][comp]; |  | ||||||
|             } else { |  | ||||||
|                 // Zero output so that attributes which aren't output won't have denormals in them,
 |  | ||||||
|                 // which would slow us down later.
 |  | ||||||
|                 memset(out, 0, sizeof(*out)); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation
 |  | ||||||
|     for (int i = 0; i < 4; ++i) { |  | ||||||
|         ret.color[i] = float24::FromFloat32( |  | ||||||
|             std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", |  | ||||||
|         ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), |  | ||||||
|         ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), |  | ||||||
|         ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); |  | ||||||
| 
 |  | ||||||
|     return ret; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| } // namespace
 | } // namespace
 | ||||||
| 
 | 
 | ||||||
| } // namespace
 | } // namespace
 | ||||||
							
								
								
									
										19
									
								
								src/video_core/shader/shader_interpreter.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								src/video_core/shader/shader_interpreter.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | ||||||
|  | // Copyright 2014 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include "video_core/pica.h" | ||||||
|  | 
 | ||||||
|  | #include "shader.h" | ||||||
|  | 
 | ||||||
|  | namespace Pica { | ||||||
|  | 
 | ||||||
|  | namespace Shader { | ||||||
|  | 
 | ||||||
|  | void RunInterpreter(UnitState& state); | ||||||
|  | 
 | ||||||
|  | } // namespace
 | ||||||
|  | 
 | ||||||
|  | } // namespace
 | ||||||
							
								
								
									
										675
									
								
								src/video_core/shader/shader_jit_x64.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										675
									
								
								src/video_core/shader/shader_jit_x64.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,675 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #include <smmintrin.h> | ||||||
|  | 
 | ||||||
|  | #include "common/x64/abi.h" | ||||||
|  | #include "common/x64/cpu_detect.h" | ||||||
|  | #include "common/x64/emitter.h" | ||||||
|  | 
 | ||||||
|  | #include "shader.h" | ||||||
|  | #include "shader_jit_x64.h" | ||||||
|  | 
 | ||||||
|  | namespace Pica { | ||||||
|  | 
 | ||||||
|  | namespace Shader { | ||||||
|  | 
 | ||||||
|  | using namespace Gen; | ||||||
|  | 
 | ||||||
|  | typedef void (JitCompiler::*JitFunction)(Instruction instr); | ||||||
|  | 
 | ||||||
|  | const JitFunction instr_table[64] = { | ||||||
|  |     &JitCompiler::Compile_ADD,      // add
 | ||||||
|  |     &JitCompiler::Compile_DP3,      // dp3
 | ||||||
|  |     &JitCompiler::Compile_DP4,      // dp4
 | ||||||
|  |     nullptr,                        // dph
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // ex2
 | ||||||
|  |     nullptr,                        // lg2
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     &JitCompiler::Compile_MUL,      // mul
 | ||||||
|  |     nullptr,                        // sge
 | ||||||
|  |     nullptr,                        // slt
 | ||||||
|  |     &JitCompiler::Compile_FLR,      // flr
 | ||||||
|  |     &JitCompiler::Compile_MAX,      // max
 | ||||||
|  |     &JitCompiler::Compile_MIN,      // min
 | ||||||
|  |     &JitCompiler::Compile_RCP,      // rcp
 | ||||||
|  |     &JitCompiler::Compile_RSQ,      // rsq
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     &JitCompiler::Compile_MOVA,     // mova
 | ||||||
|  |     &JitCompiler::Compile_MOV,      // mov
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // dphi
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // sgei
 | ||||||
|  |     &JitCompiler::Compile_SLTI,     // slti
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     nullptr,                        // unknown
 | ||||||
|  |     &JitCompiler::Compile_NOP,      // nop
 | ||||||
|  |     &JitCompiler::Compile_END,      // end
 | ||||||
|  |     nullptr,                        // break
 | ||||||
|  |     &JitCompiler::Compile_CALL,     // call
 | ||||||
|  |     &JitCompiler::Compile_CALLC,    // callc
 | ||||||
|  |     &JitCompiler::Compile_CALLU,    // callu
 | ||||||
|  |     &JitCompiler::Compile_IF,       // ifu
 | ||||||
|  |     &JitCompiler::Compile_IF,       // ifc
 | ||||||
|  |     &JitCompiler::Compile_LOOP,     // loop
 | ||||||
|  |     nullptr,                        // emit
 | ||||||
|  |     nullptr,                        // sete
 | ||||||
|  |     &JitCompiler::Compile_JMP,      // jmpc
 | ||||||
|  |     &JitCompiler::Compile_JMP,      // jmpu
 | ||||||
|  |     &JitCompiler::Compile_CMP,      // cmp
 | ||||||
|  |     &JitCompiler::Compile_CMP,      // cmp
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // madi
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  |     &JitCompiler::Compile_MAD,      // mad
 | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
 | ||||||
|  | // be used as scratch registers within a compiler function. The other registers have designated
 | ||||||
|  | // purposes, as documented below:
 | ||||||
|  | 
 | ||||||
|  | /// Pointer to the uniform memory
 | ||||||
|  | static const X64Reg UNIFORMS = R9; | ||||||
|  | /// The two 32-bit VS address offset registers set by the MOVA instruction
 | ||||||
|  | static const X64Reg ADDROFFS_REG_0 = R10; | ||||||
|  | static const X64Reg ADDROFFS_REG_1 = R11; | ||||||
|  | /// VS loop count register
 | ||||||
|  | static const X64Reg LOOPCOUNT_REG = R12; | ||||||
|  | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
 | ||||||
|  | static const X64Reg LOOPCOUNT = RSI; | ||||||
|  | /// Number to increment LOOPCOUNT_REG by on each loop iteration
 | ||||||
|  | static const X64Reg LOOPINC = RDI; | ||||||
|  | /// Result of the previous CMP instruction for the X-component comparison
 | ||||||
|  | static const X64Reg COND0 = R13; | ||||||
|  | /// Result of the previous CMP instruction for the Y-component comparison
 | ||||||
|  | static const X64Reg COND1 = R14; | ||||||
|  | /// Pointer to the UnitState instance for the current VS unit
 | ||||||
|  | static const X64Reg REGISTERS = R15; | ||||||
|  | /// SIMD scratch register
 | ||||||
|  | static const X64Reg SCRATCH = XMM0; | ||||||
|  | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register
 | ||||||
|  | static const X64Reg SRC1 = XMM1; | ||||||
|  | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register
 | ||||||
|  | static const X64Reg SRC2 = XMM2; | ||||||
|  | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
 | ||||||
|  | static const X64Reg SRC3 = XMM3; | ||||||
|  | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
 | ||||||
|  | static const X64Reg ONE = XMM14; | ||||||
|  | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
 | ||||||
|  | static const X64Reg NEGBIT = XMM15; | ||||||
|  | 
 | ||||||
|  | /// Raw constant for the source register selector that indicates no swizzling is performed
 | ||||||
|  | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | ||||||
|  | /// Raw constant for the destination register enable mask that indicates all components are enabled
 | ||||||
|  | static const u8 NO_DEST_REG_MASK = 0xf; | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Emits code that loads a source register into the specified XMM register and then applies the | ||||||
|  |  * swizzle and negation selected by the instruction's operand descriptor. | ||||||
|  |  * | ||||||
|  |  * Float uniforms are read relative to UNIFORMS; every other register type is read relative to | ||||||
|  |  * REGISTERS. For instructions other than MAD/MADI, src1 (or src2 when the opcode is flagged | ||||||
|  |  * SrcInversed) is additionally indexed by the address register the instruction selects, if any. | ||||||
|  |  * The swizzle pattern is read from g_state.vs.swizzle_data while the code is being emitted. | ||||||
|  |  * | ||||||
|  |  * @param instr VS instruction, used for determining how to load the source register | ||||||
|  |  * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) | ||||||
|  |  * @param src_reg SourceRegister object corresponding to the source register to load | ||||||
|  |  * @param dest Destination XMM register to store the loaded, swizzled source register | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { | ||||||
|  |     X64Reg src_ptr; | ||||||
|  |     int src_offset; | ||||||
|  | 
 | ||||||
|  |     if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | ||||||
|  |         src_ptr = UNIFORMS; | ||||||
|  |         src_offset = src_reg.GetIndex() * sizeof(float24) * 4; | ||||||
|  |     } else { | ||||||
|  |         src_ptr = REGISTERS; | ||||||
|  |         src_offset = UnitState::InputOffset(src_reg); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     unsigned operand_desc_id; | ||||||
|  |     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||||||
|  |         instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||||||
|  |         // The MAD and MADI instructions do not use the address offset registers, so loading the
 | ||||||
|  |         // source is a bit simpler here
 | ||||||
|  | 
 | ||||||
|  |         operand_desc_id = instr.mad.operand_desc_id; | ||||||
|  | 
 | ||||||
|  |         // Load the source
 | ||||||
|  |         MOVAPS(dest, MDisp(src_ptr, src_offset)); | ||||||
|  |     } else { | ||||||
|  |         operand_desc_id = instr.common.operand_desc_id; | ||||||
|  | 
 | ||||||
|  |         const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||||||
|  |         unsigned offset_src = is_inverted ? 2 : 1; | ||||||
|  | 
 | ||||||
|  |         if (src_num == offset_src && instr.common.address_register_index != 0) { | ||||||
|  |             switch (instr.common.address_register_index) { | ||||||
|  |             case 1: // address offset 1 (held in ADDROFFS_REG_0)
 | ||||||
|  |                 MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset)); | ||||||
|  |                 break; | ||||||
|  |             case 2: // address offset 2 (held in ADDROFFS_REG_1)
 | ||||||
|  |                 MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset)); | ||||||
|  |                 break; | ||||||
|  |             case 3: // address offset 3 (the loop counter, held in LOOPCOUNT_REG)
 | ||||||
|  |                 MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset)); | ||||||
|  |                 break; | ||||||
|  |             default: | ||||||
|  |                 UNREACHABLE(); | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|  |         } else { | ||||||
|  |             // No address offset applies to this source, so load it directly
 | ||||||
|  |             MOVAPS(dest, MDisp(src_ptr, src_offset)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; | ||||||
|  | 
 | ||||||
|  |     // Generate instructions for source register swizzling as needed
 | ||||||
|  |     u8 sel = swiz.GetRawSelector(src_num); | ||||||
|  |     if (sel != NO_SRC_REG_SWIZZLE) { | ||||||
|  |         // Selector component order needs to be reversed for the SHUFPS instruction
 | ||||||
|  |         sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | ||||||
|  | 
 | ||||||
|  |         // Shuffle inputs for swizzle
 | ||||||
|  |         SHUFPS(dest, R(dest), sel); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // If the source register should be negated, flip the negative bit using XOR
 | ||||||
|  |     const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 }; | ||||||
|  |     if (negate[src_num - 1]) { | ||||||
|  |         XORPS(dest, R(NEGBIT)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { // Emits the store of src into instr's destination register, writing only the components enabled in the operand descriptor's dest_mask
 | ||||||
|  |     DestRegister dest; | ||||||
|  |     unsigned operand_desc_id; | ||||||
|  |     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||||||
|  |         instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||||||
|  |         operand_desc_id = instr.mad.operand_desc_id; | ||||||
|  |         dest = instr.mad.dest.Value(); | ||||||
|  |     } else { | ||||||
|  |         operand_desc_id = instr.common.operand_desc_id; | ||||||
|  |         dest = instr.common.dest.Value(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; | ||||||
|  | 
 | ||||||
|  |     // If all components are enabled, write the result to the destination register
 | ||||||
|  |     if (swiz.dest_mask == NO_DEST_REG_MASK) { | ||||||
|  |         // Store dest back to memory
 | ||||||
|  |         MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src); | ||||||
|  | 
 | ||||||
|  |     } else { | ||||||
|  |         // Not all components are enabled, so load the current destination value and merge the
 | ||||||
|  |         MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest))); | ||||||
|  |         // enabled components of the result into it before storing it back
 | ||||||
|  |         if (Common::GetCPUCaps().sse4_1) { | ||||||
|  |             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); // Reverse the bit order of dest_mask to form the BLENDPS lane mask
 | ||||||
|  |             BLENDPS(SCRATCH, R(src), mask); | ||||||
|  |         } else { | ||||||
|  |             MOVAPS(XMM4, R(src)); | ||||||
|  |             UNPCKHPS(XMM4, R(SCRATCH)); // Interleave the high lanes (Z/W): XMM4 = [src.z, dest.z, src.w, dest.w]
 | ||||||
|  |             UNPCKLPS(SCRATCH, R(src)); // Interleave the low lanes (X/Y): SCRATCH = [dest.x, src.x, dest.y, src.y]
 | ||||||
|  | 
 | ||||||
|  |             // Compute selector to selectively copy source components to destination for SHUFPS instruction
 | ||||||
|  |             u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | | ||||||
|  |                      ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | ||||||
|  |                      ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | ||||||
|  |                      ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | ||||||
|  |             SHUFPS(SCRATCH, R(XMM4), sel); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // Store dest back to memory
 | ||||||
|  |         MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_EvaluateCondition(Instruction instr) { // Emits code that leaves a non-zero value in EAX (ZF clear) when the flow-control condition on COND0/COND1 holds, and zero (ZF set) otherwise
 | ||||||
|  |     // Note: XOR with (ref ^ 1) acts as an NXOR with ref, i.e. the result is 1 when COND equals
 | ||||||
|  |     switch (instr.flow_control.op) { | ||||||
|  |     case Instruction::FlowControlType::Or: | ||||||
|  |         MOV(32, R(RAX), R(COND0)); | ||||||
|  |         MOV(32, R(RBX), R(COND1)); | ||||||
|  |         XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||||||
|  |         XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||||||
|  |         OR(32, R(RAX), R(RBX)); | ||||||
|  |         break; | ||||||
|  |     // the reference value (COND0/COND1 are reduced to 0 or 1 by Compile_CMP)
 | ||||||
|  |     case Instruction::FlowControlType::And: | ||||||
|  |         MOV(32, R(RAX), R(COND0)); | ||||||
|  |         MOV(32, R(RBX), R(COND1)); | ||||||
|  |         XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||||||
|  |         XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||||||
|  |         AND(32, R(RAX), R(RBX)); | ||||||
|  |         break; | ||||||
|  |     // The single-component cases rely on the flags set by the XOR itself
 | ||||||
|  |     case Instruction::FlowControlType::JustX: | ||||||
|  |         MOV(32, R(RAX), R(COND0)); | ||||||
|  |         XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | ||||||
|  |         break; | ||||||
|  | 
 | ||||||
|  |     case Instruction::FlowControlType::JustY: | ||||||
|  |         MOV(32, R(RAX), R(COND1)); | ||||||
|  |         XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); | ||||||
|  |         break; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_UniformCondition(Instruction instr) { // Emits a compare of the selected boolean uniform against zero, so ZF is set when the uniform is false
 | ||||||
|  |     int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); // Byte offset of b[bool_uniform_id] within the uniforms structure
 | ||||||
|  |     CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_ADD(Instruction instr) { // Emits dest = src1 + src2 (component-wise, via ADDPS)
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  |     ADDPS(SRC1, R(SRC2)); | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_DP3(Instruction instr) { // Emits a 3-component dot product of src1 and src2, broadcast to every lane before the masked store
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  | 
 | ||||||
|  |     if (Common::GetCPUCaps().sse4_1) { | ||||||
|  |         DPPS(SRC1, R(SRC2), 0x7f); // 0x7f: multiply lanes 0-2 only, write the sum to all four lanes
 | ||||||
|  |     } else { | ||||||
|  |         MULPS(SRC1, R(SRC2)); | ||||||
|  |         // Splat the Y product across SRC2
 | ||||||
|  |         MOVAPS(SRC2, R(SRC1)); | ||||||
|  |         SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); | ||||||
|  |         // Splat the Z product across SRC3
 | ||||||
|  |         MOVAPS(SRC3, R(SRC1)); | ||||||
|  |         SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); | ||||||
|  |         // Splat the X product across SRC1, then sum the three splats
 | ||||||
|  |         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); | ||||||
|  |         ADDPS(SRC1, R(SRC2)); | ||||||
|  |         ADDPS(SRC1, R(SRC3)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_DP4(Instruction instr) { // Emits a 4-component dot product of src1 and src2, broadcast to every lane before the masked store
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  | 
 | ||||||
|  |     if (Common::GetCPUCaps().sse4_1) { | ||||||
|  |         DPPS(SRC1, R(SRC2), 0xff); // 0xff: multiply all four lanes, write the sum to all four lanes
 | ||||||
|  |     } else { | ||||||
|  |         MULPS(SRC1, R(SRC2)); | ||||||
|  |         // First pairwise sum: every lane becomes either X+Y or Z+W
 | ||||||
|  |         MOVAPS(SRC2, R(SRC1)); | ||||||
|  |         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> YXWZ
 | ||||||
|  |         ADDPS(SRC1, R(SRC2)); | ||||||
|  |         // Second pairwise sum: every lane becomes X+Y+Z+W
 | ||||||
|  |         MOVAPS(SRC2, R(SRC1)); | ||||||
|  |         SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
 | ||||||
|  |         ADDPS(SRC1, R(SRC2)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MUL(Instruction instr) { // Emits dest = src1 * src2 (component-wise, via MULPS)
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  |     MULPS(SRC1, R(SRC2)); | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_FLR(Instruction instr) { // Emits dest = floor(src1), component-wise
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  | 
 | ||||||
|  |     if (Common::GetCPUCaps().sse4_1) { | ||||||
|  |         ROUNDFLOORPS(SRC1, R(SRC1)); | ||||||
|  |     } else { | ||||||
|  |         CVTPS2DQ(SRC1, R(SRC1)); // NOTE(review): CVTPS2DQ rounds per MXCSR (round-to-nearest by default), which is not a floor, and only covers the int32 range - confirm this fallback is acceptable
 | ||||||
|  |         CVTDQ2PS(SRC1, R(SRC1)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MAX(Instruction instr) { // Emits dest = max(src1, src2) (component-wise, via MAXPS)
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  |     MAXPS(SRC1, R(SRC2)); | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MIN(Instruction instr) { // Emits dest = min(src1, src2) (component-wise, via MINPS)
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  |     MINPS(SRC1, R(SRC2)); | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MOVA(Instruction instr) { // Emits code that converts src1's X/Y components to integers and stores them, pre-multiplied by 16, in ADDROFFS_REG_0/ADDROFFS_REG_1
 | ||||||
|  |     SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; | ||||||
|  | 
 | ||||||
|  |     if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||||||
|  |         return; // Neither address register is enabled for writing, so emit nothing
 | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  | 
 | ||||||
|  |     // Convert floats to integers (only care about X and Y components)
 | ||||||
|  |     CVTPS2DQ(SRC1, R(SRC1)); // NOTE(review): rounds per MXCSR (round-to-nearest by default) rather than truncating - confirm this matches the interpreter
 | ||||||
|  | 
 | ||||||
|  |     // Get result: the low 64 bits hold X in bits 0-31 and Y in bits 32-63
 | ||||||
|  |     MOVQ_xmm(R(RAX), SRC1); | ||||||
|  | 
 | ||||||
|  |     // Handle destination enable
 | ||||||
|  |     if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | ||||||
|  |         // Move and sign-extend low 32 bits
 | ||||||
|  |         MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | ||||||
|  | 
 | ||||||
|  |         // Move and sign-extend high 32 bits
 | ||||||
|  |         SHR(64, R(RAX), Imm8(32)); | ||||||
|  |         MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | ||||||
|  | 
 | ||||||
|  |         // Multiply by 16 to be used as an offset later
 | ||||||
|  |         SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | ||||||
|  |         SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | ||||||
|  |     } else { | ||||||
|  |         if (swiz.DestComponentEnabled(0)) { | ||||||
|  |             // Move and sign-extend low 32 bits
 | ||||||
|  |             MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | ||||||
|  | 
 | ||||||
|  |             // Multiply by 16 to be used as an offset later
 | ||||||
|  |             SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | ||||||
|  |         } else if (swiz.DestComponentEnabled(1)) { | ||||||
|  |             // Move and sign-extend high 32 bits
 | ||||||
|  |             SHR(64, R(RAX), Imm8(32)); | ||||||
|  |             MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | ||||||
|  | 
 | ||||||
|  |             // Multiply by 16 to be used as an offset later
 | ||||||
|  |             SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MOV(Instruction instr) { // Emits dest = src1 (after swizzle/negate, honoring the destination mask)
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Emits code for SLTI: each enabled destination component is set to 1.0f where src1 < src2 and | ||||||
|  |  * to 0.0f otherwise. | ||||||
|  |  * @param instr VS instruction; its src1i/src2i operand encodings are used for this opcode | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_SLTI(Instruction instr) { | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||||||
|  |     // src2 has to be loaded as source number 2 so that its own swizzle selector, negate flag and
 | ||||||
|  |     // (for this source-inverted opcode) address offset are applied instead of src1's
 | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||||||
|  | 
 | ||||||
|  |     // A packed compare produces an all-ones/all-zeros mask in every lane; the scalar CMPSS would
 | ||||||
|  |     // leave the upper three lanes of SRC1 holding the raw source values
 | ||||||
|  |     CMPPS(SRC1, R(SRC2), CMP_LT); | ||||||
|  |     // Turn each lane's mask into 1.0f or 0.0f
 | ||||||
|  |     ANDPS(SRC1, R(ONE)); | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_RCP(Instruction instr) { // Emits dest = approximate 1 / src1, using RCPPS
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  | 
 | ||||||
|  |     // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica
 | ||||||
|  |     // performs this operation more accurately. This should be checked on hardware.
 | ||||||
|  |     RCPPS(SRC1, R(SRC1)); | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_RSQ(Instruction instr) { // Emits dest = approximate 1 / sqrt(src1), using RSQRTPS
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  | 
 | ||||||
|  |     // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica
 | ||||||
|  |     // performs this operation more accurately. This should be checked on hardware.
 | ||||||
|  |     RSQRTPS(SRC1, R(SRC1)); | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_NOP(Instruction instr) { // Intentionally emits no code
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_END(Instruction instr) { // Emits the exit sequence: restore the callee-saved registers and return from the generated code
 | ||||||
|  |     ABI_PopAllCalleeSavedRegsAndAdjustStack(); | ||||||
|  |     RET(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Compiles the body of a CALL in place: every instruction in the range | ||||||
|  |  * [dest_offset, dest_offset + num_instructions) is compiled at the call site. | ||||||
|  |  * @param instr VS flow-control instruction describing the called range | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_CALL(Instruction instr) { | ||||||
|  |     const unsigned end_offset = instr.flow_control.dest_offset + instr.flow_control.num_instructions; | ||||||
|  | 
 | ||||||
|  |     // No increment expression: Compile_NextInstr receives the cursor by pointer and is
 | ||||||
|  |     // presumably responsible for advancing it
 | ||||||
|  |     for (unsigned cursor = instr.flow_control.dest_offset; cursor < end_offset;) { | ||||||
|  |         Compile_NextInstr(&cursor); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Compiles a conditional CALL whose condition is taken from the previous CMP results. | ||||||
|  |  * @param instr VS flow-control instruction describing the condition and the called range | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_CALLC(Instruction instr) { | ||||||
|  |     // The condition evaluation leaves ZF set when the condition does not hold
 | ||||||
|  |     Compile_EvaluateCondition(instr); | ||||||
|  |     FixupBranch skip_call = J_CC(CC_Z, true); | ||||||
|  | 
 | ||||||
|  |     // Inlined call body, jumped over when the condition is false
 | ||||||
|  |     Compile_CALL(instr); | ||||||
|  | 
 | ||||||
|  |     SetJumpTarget(skip_call); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Compiles a conditional CALL whose condition is a boolean uniform. | ||||||
|  |  * @param instr VS flow-control instruction naming the uniform and the called range | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_CALLU(Instruction instr) { | ||||||
|  |     // The uniform is compared against zero, so ZF is set when it is false
 | ||||||
|  |     Compile_UniformCondition(instr); | ||||||
|  |     FixupBranch skip_call = J_CC(CC_Z, true); | ||||||
|  | 
 | ||||||
|  |     // Inlined call body, jumped over when the uniform is false
 | ||||||
|  |     Compile_CALL(instr); | ||||||
|  | 
 | ||||||
|  |     SetJumpTarget(skip_call); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_CMP(Instruction instr) { // Emits the X and Y component comparisons of src1 against src2; COND0 (X) and COND1 (Y) end up as 1 when the comparison holds and 0 otherwise
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||||||
|  |     Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||||||
|  |     // SSE comparison predicates indexed by compare_op. NOTE(review): only 6 entries - confirm compare_op can never exceed 5
 | ||||||
|  |     static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; | ||||||
|  | 
 | ||||||
|  |     if (instr.common.compare_op.x == instr.common.compare_op.y) { | ||||||
|  |         // Compare X-component and Y-component together
 | ||||||
|  |         CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); | ||||||
|  |         // Both masks live in the low 64 bits: X in bits 0-31, Y in bits 32-63
 | ||||||
|  |         MOVQ_xmm(R(COND0), SRC1); | ||||||
|  |         MOV(64, R(COND1), R(COND0)); | ||||||
|  |     } else { | ||||||
|  |         // Compare X-component
 | ||||||
|  |         MOVAPS(SCRATCH, R(SRC1)); | ||||||
|  |         CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); | ||||||
|  | 
 | ||||||
|  |         // Compare Y-component
 | ||||||
|  |         CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); | ||||||
|  | 
 | ||||||
|  |         MOVQ_xmm(R(COND0), SCRATCH); | ||||||
|  |         MOVQ_xmm(R(COND1), SRC1); | ||||||
|  |     } | ||||||
|  |     // Reduce each all-ones/all-zeros mask to a single 0/1 value
 | ||||||
|  |     SHR(32, R(COND0), Imm8(31)); // Top bit of the X mask (bit 31)
 | ||||||
|  |     SHR(64, R(COND1), Imm8(63)); // Top bit of the Y mask (bit 63 of the low quadword)
 | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_MAD(Instruction instr) { // Emits dest = src1 * src2 + src3 (component-wise) for both the MAD and MADI encodings
 | ||||||
|  |     Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||||||
|  |     // MADI uses the alternate src2i/src3i operand encodings
 | ||||||
|  |     if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||||||
|  |         Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); | ||||||
|  |         Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); | ||||||
|  |     } else { | ||||||
|  |         Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); | ||||||
|  |         Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | ||||||
|  |     } | ||||||
|  |     // Use a single fused multiply-add when the host CPU reports FMA support
 | ||||||
|  |     if (Common::GetCPUCaps().fma) { | ||||||
|  |         VFMADD213PS(SRC1, SRC2, R(SRC3)); | ||||||
|  |     } else { | ||||||
|  |         MULPS(SRC1, R(SRC2)); | ||||||
|  |         ADDPS(SRC1, R(SRC3)); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     Compile_DestEnable(instr, SRC1); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_IF(Instruction instr) { // Emits an if / if-else construct; the condition is a boolean uniform (IFU) or the previous CMP results (IFC)
 | ||||||
|  |     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported"); | ||||||
|  | 
 | ||||||
|  |     // Evaluate the "IF" condition
 | ||||||
|  |     if (instr.opcode.Value() == OpCode::Id::IFU) { | ||||||
|  |         Compile_UniformCondition(instr); | ||||||
|  |     } else if (instr.opcode.Value() == OpCode::Id::IFC) { | ||||||
|  |         Compile_EvaluateCondition(instr); | ||||||
|  |     } | ||||||
|  |     FixupBranch b = J_CC(CC_Z, true); // Taken when the condition is false (ZF set), skipping the true-block
 | ||||||
|  | 
 | ||||||
|  |     // Compile the code that corresponds to the condition evaluating as true
 | ||||||
|  |     Compile_Block(instr.flow_control.dest_offset - 1); | ||||||
|  | 
 | ||||||
|  |     // If there isn't an "ELSE" condition, we are done here
 | ||||||
|  |     if (instr.flow_control.num_instructions == 0) { | ||||||
|  |         SetJumpTarget(b); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     // The true-block must jump over the else-block
 | ||||||
|  |     FixupBranch b2 = J(true); | ||||||
|  | 
 | ||||||
|  |     SetJumpTarget(b); | ||||||
|  | 
 | ||||||
|  |     // This code corresponds to the "ELSE" condition
 | ||||||
|  |     // Compile the code that corresponds to the condition evaluating as false
 | ||||||
|  |     Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1); | ||||||
|  | 
 | ||||||
|  |     SetJumpTarget(b2); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * Emits a counted loop driven by an integer uniform: the X-component is the iteration count | ||||||
|  |  * minus one, the Y-component the start value of the loop counter and the Z-component its | ||||||
|  |  * per-iteration increment. Backwards and nested loops are rejected by assertion. | ||||||
|  |  * | ||||||
|  |  * LOOPCOUNT_REG and LOOPINC are kept pre-multiplied by 16 (the byte size of one 4-component | ||||||
|  |  * register), because Compile_SwizzleSrc adds LOOPCOUNT_REG to the register base address with a | ||||||
|  |  * scale of 1, in the same way as the MOVA address offsets. | ||||||
|  |  * @param instr VS flow-control instruction naming the integer uniform and the loop end offset | ||||||
|  |  */ | ||||||
|  | void JitCompiler::Compile_LOOP(Instruction instr) { | ||||||
|  |     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported"); | ||||||
|  |     ASSERT_MSG(!looping, "Nested loops not supported"); | ||||||
|  | 
 | ||||||
|  |     looping = true; | ||||||
|  | 
 | ||||||
|  |     int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); | ||||||
|  |     MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); | ||||||
|  |     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); | ||||||
|  |     SHR(32, R(LOOPCOUNT_REG), Imm8(8)); | ||||||
|  |     AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
 | ||||||
|  |     SHL(32, R(LOOPCOUNT_REG), Imm8(4)); // Multiply by 16 to be used as a byte offset by Compile_SwizzleSrc
 | ||||||
|  |     MOV(32, R(LOOPINC), R(LOOPCOUNT)); | ||||||
|  |     SHR(32, R(LOOPINC), Imm8(16)); | ||||||
|  |     MOVZX(32, 8, LOOPINC, R(LOOPINC)); // Z-component is the incrementer
 | ||||||
|  |     SHL(32, R(LOOPINC), Imm8(4)); // Scale the increment by 16 as well, to match LOOPCOUNT_REG
 | ||||||
|  |     MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count
 | ||||||
|  |     ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1
 | ||||||
|  | 
 | ||||||
|  |     auto loop_start = GetCodePtr(); | ||||||
|  | 
 | ||||||
|  |     Compile_Block(instr.flow_control.dest_offset); | ||||||
|  | 
 | ||||||
|  |     ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component
 | ||||||
|  |     SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement the remaining iteration count by 1
 | ||||||
|  |     J_CC(CC_NZ, loop_start); // Loop again while iterations remain
 | ||||||
|  | 
 | ||||||
|  |     looping = false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_JMP(Instruction instr) { | ||||||
|  |     ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported"); | ||||||
|  | 
 | ||||||
|  |     if (instr.opcode.Value() == OpCode::Id::JMPC) | ||||||
|  |         Compile_EvaluateCondition(instr); | ||||||
|  |     else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||||||
|  |         Compile_UniformCondition(instr); | ||||||
|  |     else | ||||||
|  |         UNREACHABLE(); | ||||||
|  | 
 | ||||||
|  |     FixupBranch b = J_CC(CC_NZ, true); | ||||||
|  | 
 | ||||||
|  |     Compile_Block(instr.flow_control.dest_offset); | ||||||
|  | 
 | ||||||
|  |     SetJumpTarget(b); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_Block(unsigned stop) { | ||||||
|  |     // Save current offset pointer
 | ||||||
|  |     unsigned* prev_offset_ptr = offset_ptr; | ||||||
|  |     unsigned offset = *prev_offset_ptr; | ||||||
|  | 
 | ||||||
|  |     while (offset <= stop) | ||||||
|  |         Compile_NextInstr(&offset); | ||||||
|  | 
 | ||||||
|  |     // Restore current offset pointer
 | ||||||
|  |     offset_ptr = prev_offset_ptr; | ||||||
|  |     *offset_ptr = offset; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Compile_NextInstr(unsigned* offset) { | ||||||
|  |     offset_ptr = offset; | ||||||
|  | 
 | ||||||
|  |     Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; | ||||||
|  |     OpCode::Id opcode = instr.opcode.Value(); | ||||||
|  |     auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||||||
|  | 
 | ||||||
|  |     if (instr_func) { | ||||||
|  |         // JIT the instruction!
 | ||||||
|  |         ((*this).*instr_func)(instr); | ||||||
|  |     } else { | ||||||
|  |         // Unhandled instruction
 | ||||||
|  |         LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | CompiledShader* JitCompiler::Compile() { | ||||||
|  |     const u8* start = GetCodePtr(); | ||||||
|  |     const auto& code = g_state.vs.program_code; | ||||||
|  |     unsigned offset = g_state.regs.vs.main_offset; | ||||||
|  | 
 | ||||||
|  |     ABI_PushAllCalleeSavedRegsAndAdjustStack(); | ||||||
|  | 
 | ||||||
|  |     MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); | ||||||
|  |     MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); | ||||||
|  | 
 | ||||||
|  |     // Zero address/loop  registers
 | ||||||
|  |     XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); | ||||||
|  |     XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); | ||||||
|  |     XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); | ||||||
|  | 
 | ||||||
|  |     // Used to set a register to one
 | ||||||
|  |     static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; | ||||||
|  |     MOV(PTRBITS, R(RAX), ImmPtr(&one)); | ||||||
|  |     MOVAPS(ONE, MDisp(RAX, 0)); | ||||||
|  | 
 | ||||||
|  |     // Used to negate registers
 | ||||||
|  |     static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; | ||||||
|  |     MOV(PTRBITS, R(RAX), ImmPtr(&neg)); | ||||||
|  |     MOVAPS(NEGBIT, MDisp(RAX, 0)); | ||||||
|  | 
 | ||||||
|  |     looping = false; | ||||||
|  | 
 | ||||||
|  |     while (offset < g_state.vs.program_code.size()) { | ||||||
|  |         Compile_NextInstr(&offset); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     return (CompiledShader*)start; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | JitCompiler::JitCompiler() { | ||||||
|  |     AllocCodeSpace(1024 * 1024 * 4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void JitCompiler::Clear() { | ||||||
|  |     ClearCodeSpace(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | } // namespace Shader
 | ||||||
|  | 
 | ||||||
|  | } // namespace Pica
 | ||||||
							
								
								
									
										79
									
								
								src/video_core/shader/shader_jit_x64.h
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								src/video_core/shader/shader_jit_x64.h
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,79 @@ | ||||||
|  | // Copyright 2015 Citra Emulator Project
 | ||||||
|  | // Licensed under GPLv2 or any later version
 | ||||||
|  | // Refer to the license.txt file included.
 | ||||||
|  | 
 | ||||||
|  | #pragma once | ||||||
|  | 
 | ||||||
|  | #include <nihstro/shader_bytecode.h> | ||||||
|  | 
 | ||||||
|  | #include "common/x64/emitter.h" | ||||||
|  | 
 | ||||||
|  | #include "video_core/pica.h" | ||||||
|  | 
 | ||||||
|  | #include "shader.h" | ||||||
|  | 
 | ||||||
|  | using nihstro::Instruction; | ||||||
|  | using nihstro::OpCode; | ||||||
|  | using nihstro::SwizzlePattern; | ||||||
|  | 
 | ||||||
|  | namespace Pica { | ||||||
|  | 
 | ||||||
|  | namespace Shader { | ||||||
|  | 
 | ||||||
|  | using CompiledShader = void(void* registers); | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||||||
|  |  * code that can be executed on the host machine directly. | ||||||
|  |  */ | ||||||
|  | class JitCompiler : public Gen::XCodeBlock { | ||||||
|  | public: | ||||||
|  |     JitCompiler(); | ||||||
|  | 
 | ||||||
|  |     CompiledShader* Compile(); | ||||||
|  | 
 | ||||||
|  |     void Clear(); | ||||||
|  | 
 | ||||||
|  |     void Compile_ADD(Instruction instr); | ||||||
|  |     void Compile_DP3(Instruction instr); | ||||||
|  |     void Compile_DP4(Instruction instr); | ||||||
|  |     void Compile_MUL(Instruction instr); | ||||||
|  |     void Compile_FLR(Instruction instr); | ||||||
|  |     void Compile_MAX(Instruction instr); | ||||||
|  |     void Compile_MIN(Instruction instr); | ||||||
|  |     void Compile_RCP(Instruction instr); | ||||||
|  |     void Compile_RSQ(Instruction instr); | ||||||
|  |     void Compile_MOVA(Instruction instr); | ||||||
|  |     void Compile_MOV(Instruction instr); | ||||||
|  |     void Compile_SLTI(Instruction instr); | ||||||
|  |     void Compile_NOP(Instruction instr); | ||||||
|  |     void Compile_END(Instruction instr); | ||||||
|  |     void Compile_CALL(Instruction instr); | ||||||
|  |     void Compile_CALLC(Instruction instr); | ||||||
|  |     void Compile_CALLU(Instruction instr); | ||||||
|  |     void Compile_IF(Instruction instr); | ||||||
|  |     void Compile_LOOP(Instruction instr); | ||||||
|  |     void Compile_JMP(Instruction instr); | ||||||
|  |     void Compile_CMP(Instruction instr); | ||||||
|  |     void Compile_MAD(Instruction instr); | ||||||
|  | 
 | ||||||
|  | private: | ||||||
|  |     void Compile_Block(unsigned stop); | ||||||
|  |     void Compile_NextInstr(unsigned* offset); | ||||||
|  | 
 | ||||||
|  |     void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); | ||||||
|  |     void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); | ||||||
|  | 
 | ||||||
|  |     void Compile_EvaluateCondition(Instruction instr); | ||||||
|  |     void Compile_UniformCondition(Instruction instr); | ||||||
|  | 
 | ||||||
|  |     /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
 | ||||||
|  |     unsigned* offset_ptr = nullptr; | ||||||
|  | 
 | ||||||
|  |     /// Set to true if currently in a loop, used to check for the existence of nested loops
 | ||||||
|  |     bool looping = false; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | } // Shader
 | ||||||
|  | 
 | ||||||
|  | } // Pica
 | ||||||
|  | @ -1,73 +0,0 @@ | ||||||
| // Copyright 2014 Citra Emulator Project
 |  | ||||||
| // Licensed under GPLv2 or any later version
 |  | ||||||
| // Refer to the license.txt file included.
 |  | ||||||
| 
 |  | ||||||
| #pragma once |  | ||||||
| 
 |  | ||||||
| #include <type_traits> |  | ||||||
| 
 |  | ||||||
| #include "common/vector_math.h" |  | ||||||
| 
 |  | ||||||
| #include "pica.h" |  | ||||||
| 
 |  | ||||||
| namespace Pica { |  | ||||||
| 
 |  | ||||||
| namespace VertexShader { |  | ||||||
| 
 |  | ||||||
| struct InputVertex { |  | ||||||
|     Math::Vec4<float24> attr[16]; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| struct OutputVertex { |  | ||||||
|     OutputVertex() = default; |  | ||||||
| 
 |  | ||||||
|     // VS output attributes
 |  | ||||||
|     Math::Vec4<float24> pos; |  | ||||||
|     Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
 |  | ||||||
|     Math::Vec4<float24> color; |  | ||||||
|     Math::Vec2<float24> tc0; |  | ||||||
|     Math::Vec2<float24> tc1; |  | ||||||
|     float24 pad[6]; |  | ||||||
|     Math::Vec2<float24> tc2; |  | ||||||
| 
 |  | ||||||
|     // Padding for optimal alignment
 |  | ||||||
|     float24 pad2[4]; |  | ||||||
| 
 |  | ||||||
|     // Attributes used to store intermediate results
 |  | ||||||
| 
 |  | ||||||
|     // position after perspective divide
 |  | ||||||
|     Math::Vec3<float24> screenpos; |  | ||||||
|     float24 pad3; |  | ||||||
| 
 |  | ||||||
|     // Linear interpolation
 |  | ||||||
|     // factor: 0=this, 1=vtx
 |  | ||||||
|     void Lerp(float24 factor, const OutputVertex& vtx) { |  | ||||||
|         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); |  | ||||||
| 
 |  | ||||||
|         // TODO: Should perform perspective correct interpolation here...
 |  | ||||||
|         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); |  | ||||||
|         tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); |  | ||||||
|         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); |  | ||||||
| 
 |  | ||||||
|         screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); |  | ||||||
| 
 |  | ||||||
|         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     // Linear interpolation
 |  | ||||||
|     // factor: 0=v0, 1=v1
 |  | ||||||
|     static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { |  | ||||||
|         OutputVertex ret = v0; |  | ||||||
|         ret.Lerp(factor, v1); |  | ||||||
|         return ret; |  | ||||||
|     } |  | ||||||
| }; |  | ||||||
| static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); |  | ||||||
| static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); |  | ||||||
| 
 |  | ||||||
| OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); |  | ||||||
| 
 |  | ||||||
| } // namespace
 |  | ||||||
| 
 |  | ||||||
| } // namespace
 |  | ||||||
| 
 |  | ||||||
|  | @ -23,6 +23,7 @@ EmuWindow*      g_emu_window    = nullptr;     ///< Frontend emulator window | ||||||
| RendererBase*   g_renderer      = nullptr;     ///< Renderer plugin
 | RendererBase*   g_renderer      = nullptr;     ///< Renderer plugin
 | ||||||
| 
 | 
 | ||||||
| std::atomic<bool> g_hw_renderer_enabled; | std::atomic<bool> g_hw_renderer_enabled; | ||||||
|  | std::atomic<bool> g_shader_jit_enabled; | ||||||
| 
 | 
 | ||||||
| /// Initialize the video core
 | /// Initialize the video core
 | ||||||
| void Init(EmuWindow* emu_window) { | void Init(EmuWindow* emu_window) { | ||||||
|  |  | ||||||
|  | @ -32,8 +32,9 @@ static const int kScreenBottomHeight    = 240;  ///< 3DS bottom screen height | ||||||
| extern RendererBase*   g_renderer;              ///< Renderer plugin
 | extern RendererBase*   g_renderer;              ///< Renderer plugin
 | ||||||
| extern EmuWindow*      g_emu_window;            ///< Emu window
 | extern EmuWindow*      g_emu_window;            ///< Emu window
 | ||||||
| 
 | 
 | ||||||
| // TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui)
 | // TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui)
 | ||||||
| extern std::atomic<bool> g_hw_renderer_enabled; | extern std::atomic<bool> g_hw_renderer_enabled; | ||||||
|  | extern std::atomic<bool> g_shader_jit_enabled; | ||||||
| 
 | 
 | ||||||
| /// Start the video core
 | /// Start the video core
 | ||||||
| void Start(); | void Start(); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue