mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 05:40:04 +00:00 
			
		
		
		
	
						commit
						b7e0b16354
					
				
					 7 changed files with 103 additions and 79 deletions
				
			
		|  | @ -41,11 +41,11 @@ else() | |||
|     message(STATUS "libpng not found. Some debugging features have been disabled.") | ||||
| endif() | ||||
| 
 | ||||
| find_package(Boost) | ||||
| find_package(Boost 1.57.0) | ||||
| if (Boost_FOUND) | ||||
|     include_directories(${Boost_INCLUDE_DIRS}) | ||||
| else() | ||||
|     message(STATUS "Boost not found, falling back to externals") | ||||
|     message(STATUS "Boost 1.57.0 or newer not found, falling back to externals") | ||||
|     include_directories(externals/boost) | ||||
| endif() | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										2
									
								
								externals/boost
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								externals/boost
									
										
									
									
										vendored
									
									
								
							|  | @ -1 +1 @@ | |||
| Subproject commit b060148c08ae87a3a5809c4f48cb26ac667487ab | ||||
| Subproject commit 97052c28acb141dbf3c5e14114af99045344b695 | ||||
|  | @ -2,7 +2,7 @@ | |||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include <vector> | ||||
| #include <boost/container/static_vector.hpp> | ||||
| 
 | ||||
| #include "clipper.h" | ||||
| #include "pica.h" | ||||
|  | @ -91,25 +91,31 @@ static void InitScreenCoordinates(OutputVertex& vtx) | |||
|     viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range); | ||||
|     viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane); | ||||
| 
 | ||||
|     float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w; | ||||
|     vtx.color *= inv_w; | ||||
|     vtx.tc0 *= inv_w; | ||||
|     vtx.tc1 *= inv_w; | ||||
|     vtx.tc2 *= inv_w; | ||||
|     vtx.pos.w = inv_w; | ||||
| 
 | ||||
|     // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
 | ||||
|     vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; | ||||
|     vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; | ||||
|     vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale; | ||||
|     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; | ||||
|     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; | ||||
|     vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; | ||||
| } | ||||
| 
 | ||||
| void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { | ||||
|     using boost::container::static_vector; | ||||
| 
 | ||||
|     // TODO (neobrain):
 | ||||
|     // The list of output vertices has some fixed maximum size,
 | ||||
|     // however I haven't taken the time to figure out what it is exactly.
 | ||||
|     // For now, we hence just assume a maximal size of 1000 vertices.
 | ||||
|     const size_t max_vertices = 1000; | ||||
|     std::vector<OutputVertex> buffer_vertices; | ||||
|     std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 }; | ||||
| 
 | ||||
|     // Make sure to reserve space for all vertices.
 | ||||
|     // Without this, buffer reallocation would invalidate references.
 | ||||
|     buffer_vertices.reserve(max_vertices); | ||||
|     // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduce 2 at
 | ||||
|     // the new edge (or fewer in degenerate cases). As such, we can say that each clipping plane
 | ||||
|     // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
 | ||||
|     // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
 | ||||
|     static const size_t MAX_VERTICES = 9; | ||||
|     static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 }; | ||||
|     static_vector<OutputVertex, MAX_VERTICES> buffer_b; | ||||
|     auto* output_list = &buffer_a; | ||||
|     auto* input_list  = &buffer_b; | ||||
| 
 | ||||
|     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
 | ||||
|     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
 | ||||
|  | @ -120,48 +126,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { | |||
|                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), | ||||
|                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { | ||||
| 
 | ||||
|         const std::vector<OutputVertex*> input_list = output_list; | ||||
|         output_list.clear(); | ||||
|         std::swap(input_list, output_list); | ||||
|         output_list->clear(); | ||||
| 
 | ||||
|         const OutputVertex* reference_vertex = input_list.back(); | ||||
|         const OutputVertex* reference_vertex = &input_list->back(); | ||||
| 
 | ||||
|         for (const auto& vertex : input_list) { | ||||
|         for (const auto& vertex : *input_list) { | ||||
|             // NOTE: This algorithm changes vertex order in some cases!
 | ||||
|             if (edge.IsInside(*vertex)) { | ||||
|             if (edge.IsInside(vertex)) { | ||||
|                 if (edge.IsOutSide(*reference_vertex)) { | ||||
|                     buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); | ||||
|                     output_list.push_back(&(buffer_vertices.back())); | ||||
|                     output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|                 } | ||||
| 
 | ||||
|                 output_list.push_back(vertex); | ||||
|                 output_list->push_back(vertex); | ||||
|             } else if (edge.IsInside(*reference_vertex)) { | ||||
|                 buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); | ||||
|                 output_list.push_back(&(buffer_vertices.back())); | ||||
|                 output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|             } | ||||
| 
 | ||||
|             reference_vertex = vertex; | ||||
|             reference_vertex = &vertex; | ||||
|         } | ||||
| 
 | ||||
|         // Need to have at least a full triangle to continue...
 | ||||
|         if (output_list.size() < 3) | ||||
|         if (output_list->size() < 3) | ||||
|             return; | ||||
|     } | ||||
| 
 | ||||
|     InitScreenCoordinates(*(output_list[0])); | ||||
|     InitScreenCoordinates(*(output_list[1])); | ||||
|     InitScreenCoordinates((*output_list)[0]); | ||||
|     InitScreenCoordinates((*output_list)[1]); | ||||
| 
 | ||||
|     for (size_t i = 0; i < output_list.size() - 2; i ++) { | ||||
|         OutputVertex& vtx0 = *(output_list[0]); | ||||
|         OutputVertex& vtx1 = *(output_list[i+1]); | ||||
|         OutputVertex& vtx2 = *(output_list[i+2]); | ||||
|     for (size_t i = 0; i < output_list->size() - 2; i ++) { | ||||
|         OutputVertex& vtx0 = (*output_list)[0]; | ||||
|         OutputVertex& vtx1 = (*output_list)[i+1]; | ||||
|         OutputVertex& vtx2 = (*output_list)[i+2]; | ||||
| 
 | ||||
|         InitScreenCoordinates(vtx2); | ||||
| 
 | ||||
|         LOG_TRACE(Render_Software, | ||||
|                   "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), " | ||||
|                   "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), " | ||||
|                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and " | ||||
|                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)", | ||||
|                   i,output_list.size(), buffer_vertices.size(), | ||||
|                   i, output_list->size(), | ||||
|                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), | ||||
|                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), | ||||
|                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(), | ||||
|  |  | |||
|  | @ -304,7 +304,6 @@ std::unique_ptr<PicaTrace> FinishPicaTracing() | |||
| } | ||||
| 
 | ||||
| const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) { | ||||
| 
 | ||||
|     // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
 | ||||
|     // of which is composed of four 2x2 subtiles each of which is composed of four texels.
 | ||||
|     // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
 | ||||
|  | @ -323,41 +322,39 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
|     // 02 03 06 07 18 19 22 23
 | ||||
|     // 00 01 04 05 16 17 20 21
 | ||||
| 
 | ||||
|     // TODO(neobrain): Not sure if this swizzling pattern is used for all textures.
 | ||||
|     // To be flexible in case different but similar patterns are used, we keep this
 | ||||
|     // somewhat inefficient code around for now.
 | ||||
|     int texel_index_within_tile = 0; | ||||
|     for (int block_size_index = 0; block_size_index < 3; ++block_size_index) { | ||||
|         int sub_tile_width = 1 << block_size_index; | ||||
|         int sub_tile_height = 1 << block_size_index; | ||||
|     const unsigned int block_width = 8; | ||||
|     const unsigned int block_height = 8; | ||||
| 
 | ||||
|         int sub_tile_index = (x & sub_tile_width) << block_size_index; | ||||
|         sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index); | ||||
|         texel_index_within_tile += sub_tile_index; | ||||
|     } | ||||
|     const unsigned int coarse_x = x & ~7; | ||||
|     const unsigned int coarse_y = y & ~7; | ||||
| 
 | ||||
|     const int block_width = 8; | ||||
|     const int block_height = 8; | ||||
|     // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 | ||||
|     // arranged in a Z-order curve. More details on the bit manipulation at:
 | ||||
|     // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 | ||||
|     unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 | ||||
|     i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 | ||||
|     i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 | ||||
|     i = (i | (i >> 7)) & 0x3F; | ||||
| 
 | ||||
|     int coarse_x = (x / block_width) * block_width; | ||||
|     int coarse_y = (y / block_height) * block_height; | ||||
|     source += coarse_y * info.stride; | ||||
|     const unsigned int offset = coarse_x * block_height + i; | ||||
| 
 | ||||
|     switch (info.format) { | ||||
|     case Regs::TextureFormat::RGBA8: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height * 4 + coarse_y * info.stride + texel_index_within_tile * 4; | ||||
|         const u8* source_ptr = source + offset * 4; | ||||
|         return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] }; | ||||
|     } | ||||
| 
 | ||||
|     case Regs::TextureFormat::RGB8: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height * 3 + coarse_y * info.stride + texel_index_within_tile * 3; | ||||
|         const u8* source_ptr = source + offset * 3; | ||||
|         return { source_ptr[2], source_ptr[1], source_ptr[0], 255 }; | ||||
|     } | ||||
| 
 | ||||
|     case Regs::TextureFormat::RGBA5551: | ||||
|     { | ||||
|         const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2); | ||||
|         const u16 source_ptr = *(const u16*)(source + offset * 2); | ||||
|         u8 r = (source_ptr >> 11) & 0x1F; | ||||
|         u8 g = ((source_ptr) >> 6) & 0x1F; | ||||
|         u8 b = (source_ptr >> 1) & 0x1F; | ||||
|  | @ -367,7 +364,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::RGB565: | ||||
|     { | ||||
|         const u16 source_ptr = *(const u16*)(source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2); | ||||
|         const u16 source_ptr = *(const u16*)(source + offset * 2); | ||||
|         u8 r = (source_ptr >> 11) & 0x1F; | ||||
|         u8 g = ((source_ptr) >> 5) & 0x3F; | ||||
|         u8 b = (source_ptr) & 0x1F; | ||||
|  | @ -376,7 +373,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::RGBA4: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2; | ||||
|         const u8* source_ptr = source + offset * 2; | ||||
|         u8 r = source_ptr[1] >> 4; | ||||
|         u8 g = source_ptr[1] & 0xFF; | ||||
|         u8 b = source_ptr[0] >> 4; | ||||
|  | @ -390,7 +387,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::IA8: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height * 2 + coarse_y * info.stride + texel_index_within_tile * 2; | ||||
|         const u8* source_ptr = source + offset * 2; | ||||
| 
 | ||||
|         // TODO: component order not verified
 | ||||
| 
 | ||||
|  | @ -404,13 +401,13 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::I8: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile; | ||||
|         const u8* source_ptr = source + offset; | ||||
|         return { *source_ptr, *source_ptr, *source_ptr, 255 }; | ||||
|     } | ||||
| 
 | ||||
|     case Regs::TextureFormat::A8: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height + coarse_y * info.stride + texel_index_within_tile; | ||||
|         const u8* source_ptr = source + offset; | ||||
| 
 | ||||
|         if (disable_alpha) { | ||||
|             return { *source_ptr, *source_ptr, *source_ptr, 255 }; | ||||
|  | @ -421,7 +418,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::IA4: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2; | ||||
|         const u8* source_ptr = source + offset / 2; | ||||
| 
 | ||||
|         // TODO: component order not verified
 | ||||
| 
 | ||||
|  | @ -440,7 +437,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture | |||
| 
 | ||||
|     case Regs::TextureFormat::A4: | ||||
|     { | ||||
|         const u8* source_ptr = source + coarse_x * block_height / 2 + coarse_y * info.stride + texel_index_within_tile / 2; | ||||
|         const u8* source_ptr = source + offset / 2; | ||||
| 
 | ||||
|         // TODO: component order not verified
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -757,6 +757,26 @@ struct float24 { | |||
|         return float24::FromFloat32(ToFloat32() - flt.ToFloat32()); | ||||
|     } | ||||
| 
 | ||||
|     float24& operator *= (const float24& flt) { | ||||
|         value *= flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     float24& operator /= (const float24& flt) { | ||||
|         value /= flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     float24& operator += (const float24& flt) { | ||||
|         value += flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     float24& operator -= (const float24& flt) { | ||||
|         value -= flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     float24 operator - () const { | ||||
|         return float24::FromFloat32(-ToFloat32()); | ||||
|     } | ||||
|  |  | |||
|  | @ -106,6 +106,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||
|     int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; | ||||
|     int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; | ||||
| 
 | ||||
|     auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); | ||||
| 
 | ||||
|     auto textures = registers.GetTextures(); | ||||
|     auto tev_stages = registers.GetTevStages(); | ||||
| 
 | ||||
|     // TODO: Not sure if looping through x first might be faster
 | ||||
|     for (u16 y = min_y; y < max_y; y += 0x10) { | ||||
|         for (u16 x = min_x; x < max_x; x += 0x10) { | ||||
|  | @ -129,6 +134,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||
|             if (w0 < 0 || w1 < 0 || w2 < 0) | ||||
|                 continue; | ||||
| 
 | ||||
|             auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)), | ||||
|                                                 float24::FromFloat32(static_cast<float>(w1)), | ||||
|                                                 float24::FromFloat32(static_cast<float>(w2))); | ||||
|             float24 interpolated_w_inverse = float24::FromFloat32(1.0f) / Math::Dot(w_inverse, baricentric_coordinates); | ||||
| 
 | ||||
|             // Perspective correct attribute interpolation:
 | ||||
|             // Attribute values cannot be calculated by simple linear interpolation since
 | ||||
|             // they are not linear in screen space. For example, when interpolating a
 | ||||
|  | @ -145,19 +155,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||
|             //
 | ||||
|             // The generalization to three vertices is straightforward in barycentric coordinates.
 | ||||
|             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { | ||||
|                 auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w, | ||||
|                                                  attr1 / v1.pos.w, | ||||
|                                                  attr2 / v2.pos.w); | ||||
|                 auto w_inverse   = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w, | ||||
|                                                  float24::FromFloat32(1.f) / v1.pos.w, | ||||
|                                                  float24::FromFloat32(1.f) / v2.pos.w); | ||||
|                 auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)), | ||||
|                                                              float24::FromFloat32(static_cast<float>(w1)), | ||||
|                                                              float24::FromFloat32(static_cast<float>(w2))); | ||||
| 
 | ||||
|                 auto attr_over_w = Math::MakeVec(attr0, attr1, attr2); | ||||
|                 float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates); | ||||
|                 float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates); | ||||
|                 return interpolated_attr_over_w / interpolated_w_inverse; | ||||
|                 return interpolated_attr_over_w * interpolated_w_inverse; | ||||
|             }; | ||||
| 
 | ||||
|             Math::Vec4<u8> primary_color{ | ||||
|  | @ -177,7 +177,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||
| 
 | ||||
|             Math::Vec4<u8> texture_color[3]{}; | ||||
|             for (int i = 0; i < 3; ++i) { | ||||
|                 auto texture = registers.GetTextures()[i]; | ||||
|                 const auto& texture = textures[i]; | ||||
|                 if (!texture.enabled) | ||||
|                     continue; | ||||
| 
 | ||||
|  | @ -219,7 +219,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0, | |||
|             // with some basic arithmetic. Alpha combiners can be configured separately but work
 | ||||
|             // analogously.
 | ||||
|             Math::Vec4<u8> combiner_output; | ||||
|             for (auto tev_stage : registers.GetTevStages()) { | ||||
|             for (const auto& tev_stage : tev_stages) { | ||||
|                 using Source = Regs::TevStageConfig::Source; | ||||
|                 using ColorModifier = Regs::TevStageConfig::ColorModifier; | ||||
|                 using AlphaModifier = Regs::TevStageConfig::AlphaModifier; | ||||
|  |  | |||
|  | @ -469,6 +469,10 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) | |||
| 
 | ||||
|     // Setup output register table
 | ||||
|     OutputVertex ret; | ||||
|     // Zero output so that attributes which aren't output won't have denormals in them, which will
 | ||||
|     // slow us down later.
 | ||||
|     memset(&ret, 0, sizeof(ret)); | ||||
| 
 | ||||
|     for (int i = 0; i < 7; ++i) { | ||||
|         const auto& output_register_map = registers.vs_output_attributes[i]; | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue