mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-30 21:30:04 +00:00 
			
		
		
		
	Optimize AttributeBuffer to OutputVertex conversion (#3283)
Optimize AttributeBuffer to OutputVertex conversion. First I unrolled the inner loop, then I pushed semantics validation outside of the hot loop. I also added overflow slots to avoid conditional branches. Super Mario 3D Land's intro runs at almost full speed when compiled with Clang, and there's a noticeable speed increase in MSVC. GCC hasn't been tested, but I'm confident in its ability to optimize this code.
This commit is contained in:
		
							parent
							
								
									3f7f2b42c0
								
							
						
					
					
						commit
						41929371dc
					
				
					 4 changed files with 34 additions and 18 deletions
				
			
		|  | @ -221,6 +221,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|                     MICROPROFILE_SCOPE(GPU_Drawing); |                     MICROPROFILE_SCOPE(GPU_Drawing); | ||||||
|                     immediate_attribute_id = 0; |                     immediate_attribute_id = 0; | ||||||
| 
 | 
 | ||||||
|  |                     Shader::OutputVertex::ValidateSemantics(regs.rasterizer); | ||||||
|  | 
 | ||||||
|                     auto* shader_engine = Shader::GetEngine(); |                     auto* shader_engine = Shader::GetEngine(); | ||||||
|                     shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); |                     shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); | ||||||
| 
 | 
 | ||||||
|  | @ -289,6 +291,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | ||||||
|         // Later, these can be compiled and cached.
 |         // Later, these can be compiled and cached.
 | ||||||
|         const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress(); |         const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress(); | ||||||
|         VertexLoader loader(regs.pipeline); |         VertexLoader loader(regs.pipeline); | ||||||
|  |         Shader::OutputVertex::ValidateSemantics(regs.rasterizer); | ||||||
| 
 | 
 | ||||||
|         // Load vertices
 |         // Load vertices
 | ||||||
|         bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); |         bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed)); | ||||||
|  |  | ||||||
|  | @ -87,6 +87,8 @@ struct RasterizerRegs { | ||||||
|         BitField<8, 5, Semantic> map_y; |         BitField<8, 5, Semantic> map_y; | ||||||
|         BitField<16, 5, Semantic> map_z; |         BitField<16, 5, Semantic> map_z; | ||||||
|         BitField<24, 5, Semantic> map_w; |         BitField<24, 5, Semantic> map_w; | ||||||
|  | 
 | ||||||
|  |         u32 raw; | ||||||
|     } vs_output_attributes[7]; |     } vs_output_attributes[7]; | ||||||
| 
 | 
 | ||||||
|     INSERT_PADDING_WORDS(0xe); |     INSERT_PADDING_WORDS(0xe); | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| // Licensed under GPLv2 or any later version
 | // Licensed under GPLv2 or any later version
 | ||||||
| // Refer to the license.txt file included.
 | // Refer to the license.txt file included.
 | ||||||
| 
 | 
 | ||||||
|  | #include <cinttypes> | ||||||
| #include <cmath> | #include <cmath> | ||||||
| #include <cstring> | #include <cstring> | ||||||
| #include "common/bit_set.h" | #include "common/bit_set.h" | ||||||
|  | @ -21,32 +22,41 @@ namespace Pica { | ||||||
| 
 | 
 | ||||||
| namespace Shader { | namespace Shader { | ||||||
| 
 | 
 | ||||||
|  | void OutputVertex::ValidateSemantics(const RasterizerRegs& regs) { | ||||||
|  |     unsigned int num_attributes = regs.vs_output_total; | ||||||
|  |     ASSERT(num_attributes <= 7); | ||||||
|  |     for (size_t attrib = 0; attrib < num_attributes; ++attrib) { | ||||||
|  |         u32 output_register_map = regs.vs_output_attributes[attrib].raw; | ||||||
|  |         for (size_t comp = 0; comp < 4; ++comp) { | ||||||
|  |             u32 semantic = (output_register_map >> (8 * comp)) & 0x1F; | ||||||
|  |             ASSERT_MSG(semantic < 24 || semantic == RasterizerRegs::VSOutputAttributes::INVALID, | ||||||
|  |                        "Invalid/unknown semantic id: %" PRIu32, semantic); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, | OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, | ||||||
|                                                const AttributeBuffer& input) { |                                                const AttributeBuffer& input) { | ||||||
|     // Setup output data
 |     // Setup output data
 | ||||||
|     union { |     union { | ||||||
|         OutputVertex ret{}; |         OutputVertex ret{}; | ||||||
|         std::array<float24, 24> vertex_slots; |         // Allow us to overflow OutputVertex to avoid branches, since
 | ||||||
|  |         // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
 | ||||||
|  |         // would be out of bounds otherwise.
 | ||||||
|  |         std::array<float24, 32> vertex_slots_overflow; | ||||||
|     }; |     }; | ||||||
|     static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes."); |  | ||||||
| 
 | 
 | ||||||
|     unsigned int num_attributes = regs.vs_output_total; |     // Assert that OutputVertex has enough space for 24 semantic registers
 | ||||||
|     ASSERT(num_attributes <= 7); |     static_assert(sizeof(std::array<float24, 24>) == sizeof(ret), | ||||||
|     for (unsigned int i = 0; i < num_attributes; ++i) { |                   "Struct and array have different sizes."); | ||||||
|         const auto& output_register_map = regs.vs_output_attributes[i]; |  | ||||||
| 
 | 
 | ||||||
|         RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = { |     unsigned int num_attributes = regs.vs_output_total & 7; | ||||||
|             output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, |     for (size_t attrib = 0; attrib < num_attributes; ++attrib) { | ||||||
|             output_register_map.map_w}; |         const auto output_register_map = regs.vs_output_attributes[attrib]; | ||||||
| 
 |         vertex_slots_overflow[output_register_map.map_x] = input.attr[attrib][0]; | ||||||
|         for (unsigned comp = 0; comp < 4; ++comp) { |         vertex_slots_overflow[output_register_map.map_y] = input.attr[attrib][1]; | ||||||
|             RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp]; |         vertex_slots_overflow[output_register_map.map_z] = input.attr[attrib][2]; | ||||||
|             if (semantic < vertex_slots.size()) { |         vertex_slots_overflow[output_register_map.map_w] = input.attr[attrib][3]; | ||||||
|                 vertex_slots[semantic] = input.attr[i][comp]; |  | ||||||
|             } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) { |  | ||||||
|                 LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
 |     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
 | ||||||
|  |  | ||||||
|  | @ -50,6 +50,7 @@ struct OutputVertex { | ||||||
|     INSERT_PADDING_WORDS(1); |     INSERT_PADDING_WORDS(1); | ||||||
|     Math::Vec2<float24> tc2; |     Math::Vec2<float24> tc2; | ||||||
| 
 | 
 | ||||||
|  |     static void ValidateSemantics(const RasterizerRegs& regs); | ||||||
|     static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, |     static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, | ||||||
|                                             const AttributeBuffer& output); |                                             const AttributeBuffer& output); | ||||||
| }; | }; | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue