mirror of
				https://github.com/PabloMK7/citra.git
				synced 2025-10-31 13:50:03 +00:00 
			
		
		
		
	Refactor software renderer (#6621)
This commit is contained in:
		
							parent
							
								
									7198243319
								
							
						
					
					
						commit
						9b82de6b24
					
				
					 39 changed files with 1815 additions and 1796 deletions
				
			
		|  | @ -83,8 +83,6 @@ add_library(video_core STATIC | |||
|     renderer_opengl/post_processing_opengl.h | ||||
|     renderer_opengl/renderer_opengl.cpp | ||||
|     renderer_opengl/renderer_opengl.h | ||||
|     renderer_software/rasterizer.cpp | ||||
|     renderer_software/rasterizer.h | ||||
|     renderer_software/renderer_software.cpp | ||||
|     renderer_software/renderer_software.h | ||||
|     renderer_software/sw_clipper.cpp | ||||
|  |  | |||
|  | @ -90,16 +90,16 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, | |||
|                 for (auto i : {0, 1, 2, 3}) { | ||||
|                     float buffer_value; | ||||
|                     std::memcpy(&buffer_value, &uniform_write_buffer[i], sizeof(float)); | ||||
|                     uniform[3 - i] = float24::FromFloat32(buffer_value); | ||||
|                     uniform[3 - i] = f24::FromFloat32(buffer_value); | ||||
|                 } | ||||
|             } else { | ||||
|                 // TODO: Untested
 | ||||
|                 uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); | ||||
|                 uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | | ||||
|                                              ((uniform_write_buffer[1] >> 16) & 0xFFFF)); | ||||
|                 uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | | ||||
|                                              ((uniform_write_buffer[2] >> 24) & 0xFF)); | ||||
|                 uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); | ||||
|                 uniform.w = f24::FromRaw(uniform_write_buffer[0] >> 8); | ||||
|                 uniform.z = f24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | | ||||
|                                          ((uniform_write_buffer[1] >> 16) & 0xFFFF)); | ||||
|                 uniform.y = f24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | | ||||
|                                          ((uniform_write_buffer[2] >> 24) & 0xFF)); | ||||
|                 uniform.x = f24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); | ||||
|             } | ||||
| 
 | ||||
|             LOG_TRACE(HW_GPU, "Set {} float uniform {:x} to ({} {} {} {})", | ||||
|  | @ -182,15 +182,15 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
|                 break; | ||||
|             } | ||||
| 
 | ||||
|             Common::Vec4<float24> attribute; | ||||
|             Common::Vec4<f24> attribute; | ||||
| 
 | ||||
|             // NOTE: The destination component order indeed is "backwards"
 | ||||
|             attribute.w = float24::FromRaw(g_state.default_attr_write_buffer[0] >> 8); | ||||
|             attribute.z = float24::FromRaw(((g_state.default_attr_write_buffer[0] & 0xFF) << 16) | | ||||
|                                            ((g_state.default_attr_write_buffer[1] >> 16) & 0xFFFF)); | ||||
|             attribute.y = float24::FromRaw(((g_state.default_attr_write_buffer[1] & 0xFFFF) << 8) | | ||||
|                                            ((g_state.default_attr_write_buffer[2] >> 24) & 0xFF)); | ||||
|             attribute.x = float24::FromRaw(g_state.default_attr_write_buffer[2] & 0xFFFFFF); | ||||
|             attribute.w = f24::FromRaw(g_state.default_attr_write_buffer[0] >> 8); | ||||
|             attribute.z = f24::FromRaw(((g_state.default_attr_write_buffer[0] & 0xFF) << 16) | | ||||
|                                        ((g_state.default_attr_write_buffer[1] >> 16) & 0xFFFF)); | ||||
|             attribute.y = f24::FromRaw(((g_state.default_attr_write_buffer[1] & 0xFFFF) << 8) | | ||||
|                                        ((g_state.default_attr_write_buffer[2] >> 24) & 0xFF)); | ||||
|             attribute.x = f24::FromRaw(g_state.default_attr_write_buffer[2] & 0xFFFFFF); | ||||
| 
 | ||||
|             LOG_TRACE(HW_GPU, "Set default VS attribute {:x} to ({} {} {} {})", (int)setup.index, | ||||
|                       attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(), | ||||
|  |  | |||
|  | @ -85,8 +85,8 @@ private: | |||
|     const Regs& regs; | ||||
|     Shader::GSUnitState& unit; | ||||
|     Shader::AttributeBuffer attribute_buffer; | ||||
|     Common::Vec4<float24>* buffer_cur; | ||||
|     Common::Vec4<float24>* buffer_end; | ||||
|     Common::Vec4<f24>* buffer_cur; | ||||
|     Common::Vec4<f24>* buffer_end; | ||||
|     unsigned int vs_output_num; | ||||
| 
 | ||||
|     GeometryPipeline_Point() : regs(g_state.regs), unit(g_state.gs_unit) {} | ||||
|  | @ -146,7 +146,7 @@ public: | |||
|         DEBUG_ASSERT(need_index); | ||||
| 
 | ||||
|         // The number of vertex input is put to the uniform register
 | ||||
|         float24 vertex_num = float24::FromFloat32(static_cast<float>(val)); | ||||
|         f24 vertex_num = f24::FromFloat32(static_cast<float>(val)); | ||||
|         setup.uniforms.f[0] = Common::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num); | ||||
| 
 | ||||
|         // The second uniform register and so on are used for receiving input vertices
 | ||||
|  | @ -183,7 +183,7 @@ private: | |||
|     Shader::ShaderSetup& setup; | ||||
|     unsigned int main_vertex_num; | ||||
|     unsigned int total_vertex_num; | ||||
|     Common::Vec4<float24>* buffer_cur; | ||||
|     Common::Vec4<f24>* buffer_cur; | ||||
|     unsigned int vs_output_num; | ||||
| 
 | ||||
|     GeometryPipeline_VariablePrimitive() : regs(g_state.regs), setup(g_state.gs) {} | ||||
|  | @ -257,9 +257,9 @@ public: | |||
| private: | ||||
|     [[maybe_unused]] const Regs& regs; | ||||
|     Shader::ShaderSetup& setup; | ||||
|     Common::Vec4<float24>* buffer_begin; | ||||
|     Common::Vec4<float24>* buffer_cur; | ||||
|     Common::Vec4<float24>* buffer_end; | ||||
|     Common::Vec4<f24>* buffer_begin; | ||||
|     Common::Vec4<f24>* buffer_cur; | ||||
|     Common::Vec4<f24>* buffer_end; | ||||
|     unsigned int vs_output_num; | ||||
| 
 | ||||
|     GeometryPipeline_FixedPrimitive() : regs(g_state.regs), setup(g_state.gs) {} | ||||
|  |  | |||
|  | @ -25,20 +25,20 @@ namespace Pica { | |||
| template <unsigned M, unsigned E> | ||||
| struct Float { | ||||
| public: | ||||
|     static Float<M, E> FromFloat32(float val) { | ||||
|     static constexpr Float<M, E> FromFloat32(float val) { | ||||
|         Float<M, E> ret; | ||||
|         ret.value = val; | ||||
|         return ret; | ||||
|     } | ||||
| 
 | ||||
|     static Float<M, E> FromRaw(u32 hex) { | ||||
|     static constexpr Float<M, E> FromRaw(u32 hex) { | ||||
|         Float<M, E> res; | ||||
| 
 | ||||
|         const int width = M + E + 1; | ||||
|         const int bias = 128 - (1 << (E - 1)); | ||||
|         int exponent = (hex >> M) & ((1 << E) - 1); | ||||
|         const unsigned mantissa = hex & ((1 << M) - 1); | ||||
|         const unsigned sign = (hex >> (E + M)) << 31; | ||||
|         const s32 width = M + E + 1; | ||||
|         const s32 bias = 128 - (1 << (E - 1)); | ||||
|         s32 exponent = (hex >> M) & ((1 << E) - 1); | ||||
|         const u32 mantissa = hex & ((1 << M) - 1); | ||||
|         const u32 sign = (hex >> (E + M)) << 31; | ||||
| 
 | ||||
|         if (hex & ((1 << (width - 1)) - 1)) { | ||||
|             if (exponent == (1 << E) - 1) | ||||
|  | @ -55,16 +55,20 @@ public: | |||
|         return res; | ||||
|     } | ||||
| 
 | ||||
|     static Float<M, E> Zero() { | ||||
|     static constexpr Float<M, E> Zero() { | ||||
|         return FromFloat32(0.f); | ||||
|     } | ||||
| 
 | ||||
|     static constexpr Float<M, E> One() { | ||||
|         return FromFloat32(1.f); | ||||
|     } | ||||
| 
 | ||||
|     // Not recommended for anything but logging
 | ||||
|     float ToFloat32() const { | ||||
|     constexpr float ToFloat32() const { | ||||
|         return value; | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E> operator*(const Float<M, E>& flt) const { | ||||
|     constexpr Float<M, E> operator*(const Float<M, E>& flt) const { | ||||
|         float result = value * flt.ToFloat32(); | ||||
|         // PICA gives 0 instead of NaN when multiplying by inf
 | ||||
|         if (std::isnan(result)) | ||||
|  | @ -73,70 +77,70 @@ public: | |||
|         return Float<M, E>::FromFloat32(result); | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E> operator/(const Float<M, E>& flt) const { | ||||
|     constexpr Float<M, E> operator/(const Float<M, E>& flt) const { | ||||
|         return Float<M, E>::FromFloat32(ToFloat32() / flt.ToFloat32()); | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E> operator+(const Float<M, E>& flt) const { | ||||
|     constexpr Float<M, E> operator+(const Float<M, E>& flt) const { | ||||
|         return Float<M, E>::FromFloat32(ToFloat32() + flt.ToFloat32()); | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E> operator-(const Float<M, E>& flt) const { | ||||
|     constexpr Float<M, E> operator-(const Float<M, E>& flt) const { | ||||
|         return Float<M, E>::FromFloat32(ToFloat32() - flt.ToFloat32()); | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E>& operator*=(const Float<M, E>& flt) { | ||||
|     constexpr Float<M, E>& operator*=(const Float<M, E>& flt) { | ||||
|         value = operator*(flt).value; | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E>& operator/=(const Float<M, E>& flt) { | ||||
|     constexpr Float<M, E>& operator/=(const Float<M, E>& flt) { | ||||
|         value /= flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E>& operator+=(const Float<M, E>& flt) { | ||||
|     constexpr Float<M, E>& operator+=(const Float<M, E>& flt) { | ||||
|         value += flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E>& operator-=(const Float<M, E>& flt) { | ||||
|     constexpr Float<M, E>& operator-=(const Float<M, E>& flt) { | ||||
|         value -= flt.ToFloat32(); | ||||
|         return *this; | ||||
|     } | ||||
| 
 | ||||
|     Float<M, E> operator-() const { | ||||
|     constexpr Float<M, E> operator-() const { | ||||
|         return Float<M, E>::FromFloat32(-ToFloat32()); | ||||
|     } | ||||
| 
 | ||||
|     bool operator<(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator<(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() < flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
|     bool operator>(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator>(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() > flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
|     bool operator>=(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator>=(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() >= flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
|     bool operator<=(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator<=(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() <= flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
|     bool operator==(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator==(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() == flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
|     bool operator!=(const Float<M, E>& flt) const { | ||||
|     constexpr bool operator!=(const Float<M, E>& flt) const { | ||||
|         return ToFloat32() != flt.ToFloat32(); | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     static const unsigned MASK = (1 << (M + E + 1)) - 1; | ||||
|     static const unsigned MANTISSA_MASK = (1 << M) - 1; | ||||
|     static const unsigned EXPONENT_MASK = (1 << E) - 1; | ||||
|     static constexpr u32 MASK = (1 << (M + E + 1)) - 1; | ||||
|     static constexpr u32 MANTISSA_MASK = (1 << M) - 1; | ||||
|     static constexpr u32 EXPONENT_MASK = (1 << E) - 1; | ||||
| 
 | ||||
|     // Stored as a regular float, merely for convenience
 | ||||
|     // TODO: Perform proper arithmetic on this!
 | ||||
|  | @ -149,8 +153,8 @@ private: | |||
|     } | ||||
| }; | ||||
| 
 | ||||
| using float24 = Float<16, 7>; | ||||
| using float20 = Float<12, 7>; | ||||
| using float16 = Float<10, 5>; | ||||
| using f24 = Pica::Float<16, 7>; | ||||
| using f20 = Pica::Float<12, 7>; | ||||
| using f16 = Pica::Float<10, 5>; | ||||
| 
 | ||||
| } // namespace Pica
 | ||||
|  |  | |||
|  | @ -10,6 +10,8 @@ | |||
| 
 | ||||
| namespace VideoCore { | ||||
| 
 | ||||
| using Pica::f24; | ||||
| 
 | ||||
| static Common::Vec4f ColorRGBA8(const u32 color) { | ||||
|     const auto rgba = | ||||
|         Common::Vec4u{color >> 0 & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF, color >> 24 & 0xFF}; | ||||
|  | @ -73,7 +75,7 @@ RasterizerAccelerated::RasterizerAccelerated(Memory::MemorySystem& memory_) | |||
|  * Fortunately however, the 3DS hardware happens to also use this exact same logic to work around | ||||
|  * these issues, making this basic implementation actually more accurate to the hardware. | ||||
|  */ | ||||
| static bool AreQuaternionsOpposite(Common::Vec4<Pica::float24> qa, Common::Vec4<Pica::float24> qb) { | ||||
| static bool AreQuaternionsOpposite(Common::Vec4<f24> qa, Common::Vec4<f24> qb) { | ||||
|     Common::Vec4f a{qa.x.ToFloat32(), qa.y.ToFloat32(), qa.z.ToFloat32(), qa.w.ToFloat32()}; | ||||
|     Common::Vec4f b{qb.x.ToFloat32(), qb.y.ToFloat32(), qb.z.ToFloat32(), qb.w.ToFloat32()}; | ||||
| 
 | ||||
|  | @ -612,7 +614,7 @@ void RasterizerAccelerated::NotifyPicaRegisterChanged(u32 id) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncDepthScale() { | ||||
|     float depth_scale = Pica::float24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); | ||||
|     const f32 depth_scale = f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); | ||||
| 
 | ||||
|     if (depth_scale != uniform_block_data.data.depth_scale) { | ||||
|         uniform_block_data.data.depth_scale = depth_scale; | ||||
|  | @ -621,8 +623,7 @@ void RasterizerAccelerated::SyncDepthScale() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncDepthOffset() { | ||||
|     float depth_offset = | ||||
|         Pica::float24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); | ||||
|     const f32 depth_offset = f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); | ||||
| 
 | ||||
|     if (depth_offset != uniform_block_data.data.depth_offset) { | ||||
|         uniform_block_data.data.depth_offset = depth_offset; | ||||
|  | @ -646,16 +647,16 @@ void RasterizerAccelerated::SyncFogColor() { | |||
| 
 | ||||
| void RasterizerAccelerated::SyncProcTexNoise() { | ||||
|     const Common::Vec2f proctex_noise_f = { | ||||
|         Pica::float16::FromRaw(regs.texturing.proctex_noise_frequency.u).ToFloat32(), | ||||
|         Pica::float16::FromRaw(regs.texturing.proctex_noise_frequency.v).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.texturing.proctex_noise_frequency.u).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.texturing.proctex_noise_frequency.v).ToFloat32(), | ||||
|     }; | ||||
|     const Common::Vec2f proctex_noise_a = { | ||||
|         regs.texturing.proctex_noise_u.amplitude / 4095.0f, | ||||
|         regs.texturing.proctex_noise_v.amplitude / 4095.0f, | ||||
|     }; | ||||
|     const Common::Vec2f proctex_noise_p = { | ||||
|         Pica::float16::FromRaw(regs.texturing.proctex_noise_u.phase).ToFloat32(), | ||||
|         Pica::float16::FromRaw(regs.texturing.proctex_noise_v.phase).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.texturing.proctex_noise_u.phase).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.texturing.proctex_noise_v.phase).ToFloat32(), | ||||
|     }; | ||||
| 
 | ||||
|     if (proctex_noise_f != uniform_block_data.data.proctex_noise_f || | ||||
|  | @ -669,8 +670,8 @@ void RasterizerAccelerated::SyncProcTexNoise() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncProcTexBias() { | ||||
|     const auto proctex_bias = Pica::float16::FromRaw(regs.texturing.proctex.bias_low | | ||||
|                                                      (regs.texturing.proctex_lut.bias_high << 8)) | ||||
|     const auto proctex_bias = Pica::f16::FromRaw(regs.texturing.proctex.bias_low | | ||||
|                                                  (regs.texturing.proctex_lut.bias_high << 8)) | ||||
|                                   .ToFloat32(); | ||||
|     if (proctex_bias != uniform_block_data.data.proctex_bias) { | ||||
|         uniform_block_data.data.proctex_bias = proctex_bias; | ||||
|  | @ -687,7 +688,7 @@ void RasterizerAccelerated::SyncAlphaTest() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncCombinerColor() { | ||||
|     auto combiner_color = ColorRGBA8(regs.texturing.tev_combiner_buffer_color.raw); | ||||
|     const auto combiner_color = ColorRGBA8(regs.texturing.tev_combiner_buffer_color.raw); | ||||
|     if (combiner_color != uniform_block_data.data.tev_combiner_buffer_color) { | ||||
|         uniform_block_data.data.tev_combiner_buffer_color = combiner_color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -695,7 +696,7 @@ void RasterizerAccelerated::SyncCombinerColor() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncTevConstColor( | ||||
|     std::size_t stage_index, const Pica::TexturingRegs::TevStageConfig& tev_stage) { | ||||
|     const size_t stage_index, const Pica::TexturingRegs::TevStageConfig& tev_stage) { | ||||
|     const auto const_color = ColorRGBA8(tev_stage.const_color); | ||||
| 
 | ||||
|     if (const_color == uniform_block_data.data.const_color[stage_index]) { | ||||
|  | @ -707,7 +708,7 @@ void RasterizerAccelerated::SyncTevConstColor( | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncGlobalAmbient() { | ||||
|     auto color = LightColor(regs.lighting.global_ambient); | ||||
|     const auto color = LightColor(regs.lighting.global_ambient); | ||||
|     if (color != uniform_block_data.data.lighting_global_ambient) { | ||||
|         uniform_block_data.data.lighting_global_ambient = color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -715,7 +716,7 @@ void RasterizerAccelerated::SyncGlobalAmbient() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightSpecular0(int light_index) { | ||||
|     auto color = LightColor(regs.lighting.light[light_index].specular_0); | ||||
|     const auto color = LightColor(regs.lighting.light[light_index].specular_0); | ||||
|     if (color != uniform_block_data.data.light_src[light_index].specular_0) { | ||||
|         uniform_block_data.data.light_src[light_index].specular_0 = color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -723,7 +724,7 @@ void RasterizerAccelerated::SyncLightSpecular0(int light_index) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightSpecular1(int light_index) { | ||||
|     auto color = LightColor(regs.lighting.light[light_index].specular_1); | ||||
|     const auto color = LightColor(regs.lighting.light[light_index].specular_1); | ||||
|     if (color != uniform_block_data.data.light_src[light_index].specular_1) { | ||||
|         uniform_block_data.data.light_src[light_index].specular_1 = color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -731,7 +732,7 @@ void RasterizerAccelerated::SyncLightSpecular1(int light_index) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightDiffuse(int light_index) { | ||||
|     auto color = LightColor(regs.lighting.light[light_index].diffuse); | ||||
|     const auto color = LightColor(regs.lighting.light[light_index].diffuse); | ||||
|     if (color != uniform_block_data.data.light_src[light_index].diffuse) { | ||||
|         uniform_block_data.data.light_src[light_index].diffuse = color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -739,7 +740,7 @@ void RasterizerAccelerated::SyncLightDiffuse(int light_index) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightAmbient(int light_index) { | ||||
|     auto color = LightColor(regs.lighting.light[light_index].ambient); | ||||
|     const auto color = LightColor(regs.lighting.light[light_index].ambient); | ||||
|     if (color != uniform_block_data.data.light_src[light_index].ambient) { | ||||
|         uniform_block_data.data.light_src[light_index].ambient = color; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -748,9 +749,9 @@ void RasterizerAccelerated::SyncLightAmbient(int light_index) { | |||
| 
 | ||||
| void RasterizerAccelerated::SyncLightPosition(int light_index) { | ||||
|     const Common::Vec3f position = { | ||||
|         Pica::float16::FromRaw(regs.lighting.light[light_index].x).ToFloat32(), | ||||
|         Pica::float16::FromRaw(regs.lighting.light[light_index].y).ToFloat32(), | ||||
|         Pica::float16::FromRaw(regs.lighting.light[light_index].z).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.lighting.light[light_index].x).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.lighting.light[light_index].y).ToFloat32(), | ||||
|         Pica::f16::FromRaw(regs.lighting.light[light_index].z).ToFloat32(), | ||||
|     }; | ||||
| 
 | ||||
|     if (position != uniform_block_data.data.light_src[light_index].position) { | ||||
|  | @ -771,8 +772,8 @@ void RasterizerAccelerated::SyncLightSpotDirection(int light_index) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightDistanceAttenuationBias(int light_index) { | ||||
|     float dist_atten_bias = | ||||
|         Pica::float20::FromRaw(regs.lighting.light[light_index].dist_atten_bias).ToFloat32(); | ||||
|     const f32 dist_atten_bias = | ||||
|         Pica::f20::FromRaw(regs.lighting.light[light_index].dist_atten_bias).ToFloat32(); | ||||
| 
 | ||||
|     if (dist_atten_bias != uniform_block_data.data.light_src[light_index].dist_atten_bias) { | ||||
|         uniform_block_data.data.light_src[light_index].dist_atten_bias = dist_atten_bias; | ||||
|  | @ -781,8 +782,8 @@ void RasterizerAccelerated::SyncLightDistanceAttenuationBias(int light_index) { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncLightDistanceAttenuationScale(int light_index) { | ||||
|     float dist_atten_scale = | ||||
|         Pica::float20::FromRaw(regs.lighting.light[light_index].dist_atten_scale).ToFloat32(); | ||||
|     const f32 dist_atten_scale = | ||||
|         Pica::f20::FromRaw(regs.lighting.light[light_index].dist_atten_scale).ToFloat32(); | ||||
| 
 | ||||
|     if (dist_atten_scale != uniform_block_data.data.light_src[light_index].dist_atten_scale) { | ||||
|         uniform_block_data.data.light_src[light_index].dist_atten_scale = dist_atten_scale; | ||||
|  | @ -792,8 +793,8 @@ void RasterizerAccelerated::SyncLightDistanceAttenuationScale(int light_index) { | |||
| 
 | ||||
| void RasterizerAccelerated::SyncShadowBias() { | ||||
|     const auto& shadow = regs.framebuffer.shadow; | ||||
|     float constant = Pica::float16::FromRaw(shadow.constant).ToFloat32(); | ||||
|     float linear = Pica::float16::FromRaw(shadow.linear).ToFloat32(); | ||||
|     const f32 constant = Pica::f16::FromRaw(shadow.constant).ToFloat32(); | ||||
|     const f32 linear = Pica::f16::FromRaw(shadow.linear).ToFloat32(); | ||||
| 
 | ||||
|     if (constant != uniform_block_data.data.shadow_bias_constant || | ||||
|         linear != uniform_block_data.data.shadow_bias_linear) { | ||||
|  | @ -804,7 +805,7 @@ void RasterizerAccelerated::SyncShadowBias() { | |||
| } | ||||
| 
 | ||||
| void RasterizerAccelerated::SyncShadowTextureBias() { | ||||
|     int bias = regs.texturing.shadow.bias << 1; | ||||
|     const s32 bias = regs.texturing.shadow.bias << 1; | ||||
|     if (bias != uniform_block_data.data.shadow_texture_bias) { | ||||
|         uniform_block_data.data.shadow_texture_bias = bias; | ||||
|         uniform_block_data.dirty = true; | ||||
|  | @ -813,7 +814,7 @@ void RasterizerAccelerated::SyncShadowTextureBias() { | |||
| 
 | ||||
| void RasterizerAccelerated::SyncTextureLodBias(int tex_index) { | ||||
|     const auto pica_textures = regs.texturing.GetTextures(); | ||||
|     const float bias = pica_textures[tex_index].config.lod.bias / 256.0f; | ||||
|     const f32 bias = pica_textures[tex_index].config.lod.bias / 256.0f; | ||||
|     if (bias != uniform_block_data.data.tex_lod_bias[tex_index]) { | ||||
|         uniform_block_data.data.tex_lod_bias[tex_index] = bias; | ||||
|         uniform_block_data.dirty = true; | ||||
|  |  | |||
|  | @ -37,9 +37,9 @@ struct RasterizerRegs { | |||
|     BitField<0, 1, u32> clip_enable; | ||||
|     BitField<0, 24, u32> clip_coef[4]; // float24
 | ||||
| 
 | ||||
|     Common::Vec4<float24> GetClipCoef() const { | ||||
|         return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]), | ||||
|                 float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])}; | ||||
|     Common::Vec4<f24> GetClipCoef() const { | ||||
|         return {f24::FromRaw(clip_coef[0]), f24::FromRaw(clip_coef[1]), f24::FromRaw(clip_coef[2]), | ||||
|                 f24::FromRaw(clip_coef[3])}; | ||||
|     } | ||||
| 
 | ||||
|     Common::Rectangle<s32> GetViewportRect() const { | ||||
|  | @ -47,9 +47,9 @@ struct RasterizerRegs { | |||
|             // These registers hold half-width and half-height, so must be multiplied by 2
 | ||||
|             viewport_corner.x,  // left
 | ||||
|             viewport_corner.y + // top
 | ||||
|                 static_cast<s32>(float24::FromRaw(viewport_size_y).ToFloat32() * 2), | ||||
|                 static_cast<s32>(f24::FromRaw(viewport_size_y).ToFloat32() * 2), | ||||
|             viewport_corner.x + // right
 | ||||
|                 static_cast<s32>(float24::FromRaw(viewport_size_x).ToFloat32() * 2), | ||||
|                 static_cast<s32>(f24::FromRaw(viewport_size_x).ToFloat32() * 2), | ||||
|             viewport_corner.y // bottom
 | ||||
|         }; | ||||
|     } | ||||
|  |  | |||
|  | @ -18,6 +18,12 @@ class System; | |||
| 
 | ||||
| namespace VideoCore { | ||||
| 
 | ||||
/// Identifier for one of the screens handled by the renderer.
/// The numeric values are spelled out; they match the implicit numbering (0, 1, 2).
enum class ScreenId : u32 {
    TopLeft = 0,
    TopRight = 1,
    Bottom = 2,
};
| 
 | ||||
| struct RendererSettings { | ||||
|     // Screenshot
 | ||||
|     std::atomic_bool screenshot_requested{false}; | ||||
|  | @ -75,7 +81,7 @@ public: | |||
|         return current_fps; | ||||
|     } | ||||
| 
 | ||||
|     int GetCurrentFrame() const { | ||||
|     s32 GetCurrentFrame() const { | ||||
|         return current_frame; | ||||
|     } | ||||
| 
 | ||||
|  | @ -108,7 +114,7 @@ protected: | |||
|     Frontend::EmuWindow& render_window;    ///< Reference to the render window handle.
 | ||||
|     Frontend::EmuWindow* secondary_window; ///< Reference to the secondary render window handle.
 | ||||
|     f32 current_fps = 0.0f;                ///< Current framerate, should be set by the renderer
 | ||||
|     int current_frame = 0;                 ///< Current frame, should be set by the renderer
 | ||||
|     s32 current_frame = 0;                 ///< Current frame, should be set by the renderer
 | ||||
| }; | ||||
| 
 | ||||
| } // namespace VideoCore
 | ||||
|  |  | |||
|  | @ -1,901 +0,0 @@ | |||
| // Copyright 2014 Citra Emulator Project
 | ||||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <array> | ||||
| #include <cmath> | ||||
| #include <tuple> | ||||
| #include "common/assert.h" | ||||
| #include "common/bit_field.h" | ||||
| #include "common/color.h" | ||||
| #include "common/common_types.h" | ||||
| #include "common/logging/log.h" | ||||
| #include "common/microprofile.h" | ||||
| #include "common/quaternion.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "core/hw/gpu.h" | ||||
| #include "core/memory.h" | ||||
| #include "video_core/debug_utils/debug_utils.h" | ||||
| #include "video_core/pica_state.h" | ||||
| #include "video_core/pica_types.h" | ||||
| #include "video_core/regs_framebuffer.h" | ||||
| #include "video_core/regs_rasterizer.h" | ||||
| #include "video_core/regs_texturing.h" | ||||
| #include "video_core/renderer_software/rasterizer.h" | ||||
| #include "video_core/renderer_software/sw_framebuffer.h" | ||||
| #include "video_core/renderer_software/sw_lighting.h" | ||||
| #include "video_core/renderer_software/sw_proctex.h" | ||||
| #include "video_core/renderer_software/sw_texturing.h" | ||||
| #include "video_core/shader/shader.h" | ||||
| #include "video_core/texture/texture_decode.h" | ||||
| #include "video_core/utils.h" | ||||
| #include "video_core/video_core.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| 
 | ||||
| // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
 | ||||
/// Thin wrapper around a raw u16 that is interpreted as a 12.4 fixed-point number
/// (upper 12 bits integer part, lower 4 bits fraction).
struct Fix12P4 {
    /// Default-constructs to zero so the raw value is never indeterminate.
    constexpr Fix12P4() = default;

    /// Wraps an already-encoded 12.4 raw value. Intentionally implicit: existing
    /// callers rely on converting from u16 without spelling out the type.
    constexpr Fix12P4(u16 val) : val(val) {}

    /// Mask selecting the 4 fractional bits of the raw value.
    static constexpr u16 FracMask() {
        return 0xF;
    }

    /// Mask selecting the 12 integer bits of the raw value.
    static constexpr u16 IntMask() {
        return static_cast<u16>(~0xF);
    }

    /// Returns the raw 12.4 encoded value.
    constexpr operator u16() const {
        return val;
    }

    /// Orders two values by their raw encoding.
    constexpr bool operator<(const Fix12P4& oth) const {
        return val < oth.val;
    }

private:
    u16 val{};
};
| 
 | ||||
| /**
 | ||||
|  * Calculate signed area of the triangle spanned by the three argument vertices. | ||||
|  * The sign denotes an orientation. | ||||
|  * | ||||
|  * @todo define orientation concretely. | ||||
|  */ | ||||
| static int SignedArea(const Common::Vec2<Fix12P4>& vtx1, const Common::Vec2<Fix12P4>& vtx2, | ||||
|                       const Common::Vec2<Fix12P4>& vtx3) { | ||||
|     const auto vec1 = Common::MakeVec(vtx2 - vtx1, 0); | ||||
|     const auto vec2 = Common::MakeVec(vtx3 - vtx1, 0); | ||||
|     // TODO: There is a very small chance this will overflow for sizeof(int) == 4
 | ||||
|     return Common::Cross(vec1, vec2).z; | ||||
| }; | ||||
| 
 | ||||
| /// Convert a 3D vector for cube map coordinates to 2D texture coordinates along with the face name
 | ||||
| static std::tuple<float24, float24, float24, PAddr> ConvertCubeCoord(float24 u, float24 v, | ||||
|                                                                      float24 w, | ||||
|                                                                      const TexturingRegs& regs) { | ||||
|     const float abs_u = std::abs(u.ToFloat32()); | ||||
|     const float abs_v = std::abs(v.ToFloat32()); | ||||
|     const float abs_w = std::abs(w.ToFloat32()); | ||||
|     float24 x, y, z; | ||||
|     PAddr addr; | ||||
|     if (abs_u > abs_v && abs_u > abs_w) { | ||||
|         if (u > float24::FromFloat32(0)) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveX); | ||||
|             y = -v; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeX); | ||||
|             y = v; | ||||
|         } | ||||
|         x = -w; | ||||
|         z = u; | ||||
|     } else if (abs_v > abs_w) { | ||||
|         if (v > float24::FromFloat32(0)) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveY); | ||||
|             x = u; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeY); | ||||
|             x = -u; | ||||
|         } | ||||
|         y = w; | ||||
|         z = v; | ||||
|     } else { | ||||
|         if (w > float24::FromFloat32(0)) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveZ); | ||||
|             y = -v; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeZ); | ||||
|             y = v; | ||||
|         } | ||||
|         x = u; | ||||
|         z = w; | ||||
|     } | ||||
|     float24 z_abs = float24::FromFloat32(std::abs(z.ToFloat32())); | ||||
|     const float24 half = float24::FromFloat32(0.5f); | ||||
|     return std::make_tuple(x / z * half + half, y / z * half + half, z_abs, addr); | ||||
| } | ||||
| 
 | ||||
| MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); | ||||
| 
 | ||||
| /**
 | ||||
|  * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing | ||||
|  * culling via recursion. | ||||
|  */ | ||||
| static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Vertex& v2, | ||||
|                                     bool reversed = false) { | ||||
|     const auto& regs = g_state.regs; | ||||
|     MICROPROFILE_SCOPE(GPU_Rasterization); | ||||
| 
 | ||||
|     // vertex positions in rasterizer coordinates
 | ||||
|     static auto FloatToFix = [](float24 flt) { | ||||
|         // TODO: Rounding here is necessary to prevent garbage pixels at
 | ||||
|         //       triangle borders. Is that the correct solution, though?
 | ||||
|         return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f))); | ||||
|     }; | ||||
|     static auto ScreenToRasterizerCoordinates = [](const Common::Vec3<float24>& vec) { | ||||
|         return Common::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)}; | ||||
|     }; | ||||
| 
 | ||||
|     Common::Vec3<Fix12P4> vtxpos[3]{ScreenToRasterizerCoordinates(v0.screenpos), | ||||
|                                     ScreenToRasterizerCoordinates(v1.screenpos), | ||||
|                                     ScreenToRasterizerCoordinates(v2.screenpos)}; | ||||
| 
 | ||||
|     if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) { | ||||
|         // Make sure we always end up with a triangle wound counter-clockwise
 | ||||
|         if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { | ||||
|             ProcessTriangleInternal(v0, v2, v1, true); | ||||
|             return; | ||||
|         } | ||||
|     } else { | ||||
|         if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) { | ||||
|             // Reverse vertex order and use the CCW code path.
 | ||||
|             ProcessTriangleInternal(v0, v2, v1, true); | ||||
|             return; | ||||
|         } | ||||
| 
 | ||||
|         // Cull away triangles which are wound clockwise.
 | ||||
|         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) | ||||
|             return; | ||||
|     } | ||||
| 
 | ||||
|     u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); | ||||
|     u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); | ||||
|     u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); | ||||
|     u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); | ||||
| 
 | ||||
|     // Convert the scissor box coordinates to 12.4 fixed point
 | ||||
|     u16 scissor_x1 = (u16)(regs.rasterizer.scissor_test.x1 << 4); | ||||
|     u16 scissor_y1 = (u16)(regs.rasterizer.scissor_test.y1 << 4); | ||||
|     // x2,y2 have +1 added to cover the entire sub-pixel area
 | ||||
|     u16 scissor_x2 = (u16)((regs.rasterizer.scissor_test.x2 + 1) << 4); | ||||
|     u16 scissor_y2 = (u16)((regs.rasterizer.scissor_test.y2 + 1) << 4); | ||||
| 
 | ||||
|     if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) { | ||||
|         // Calculate the new bounds
 | ||||
|         min_x = std::max(min_x, scissor_x1); | ||||
|         min_y = std::max(min_y, scissor_y1); | ||||
|         max_x = std::min(max_x, scissor_x2); | ||||
|         max_y = std::min(max_y, scissor_y2); | ||||
|     } | ||||
| 
 | ||||
|     min_x &= Fix12P4::IntMask(); | ||||
|     min_y &= Fix12P4::IntMask(); | ||||
|     max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask()); | ||||
|     max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask()); | ||||
| 
 | ||||
|     // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not
 | ||||
|     // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias
 | ||||
|     // values which are added to the barycentric coordinates w0, w1 and w2, respectively.
 | ||||
|     // NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones...
 | ||||
|     auto IsRightSideOrFlatBottomEdge = [](const Common::Vec2<Fix12P4>& vtx, | ||||
|                                           const Common::Vec2<Fix12P4>& line1, | ||||
|                                           const Common::Vec2<Fix12P4>& line2) { | ||||
|         if (line1.y == line2.y) { | ||||
|             // just check if vertex is above us => bottom line parallel to x-axis
 | ||||
|             return vtx.y < line1.y; | ||||
|         } else { | ||||
|             // check if vertex is on our left => right side
 | ||||
|             // TODO: Not sure how likely this is to overflow
 | ||||
|             return (int)vtx.x < (int)line1.x + ((int)line2.x - (int)line1.x) * | ||||
|                                                    ((int)vtx.y - (int)line1.y) / | ||||
|                                                    ((int)line2.y - (int)line1.y); | ||||
|         } | ||||
|     }; | ||||
|     int bias0 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0; | ||||
|     int bias1 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; | ||||
|     int bias2 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; | ||||
| 
 | ||||
|     auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); | ||||
| 
 | ||||
|     auto textures = regs.texturing.GetTextures(); | ||||
|     auto tev_stages = regs.texturing.GetTevStages(); | ||||
| 
 | ||||
|     bool stencil_action_enable = | ||||
|         g_state.regs.framebuffer.output_merger.stencil_test.enable && | ||||
|         g_state.regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8; | ||||
|     const auto stencil_test = g_state.regs.framebuffer.output_merger.stencil_test; | ||||
| 
 | ||||
|     // Enter rasterization loop, starting at the center of the topleft bounding box corner.
 | ||||
|     // TODO: Not sure if looping through x first might be faster
 | ||||
|     for (u16 y = min_y + 8; y < max_y; y += 0x10) { | ||||
|         for (u16 x = min_x + 8; x < max_x; x += 0x10) { | ||||
| 
 | ||||
|             // Do not process the pixel if it's inside the scissor box and the scissor mode is set
 | ||||
|             // to Exclude
 | ||||
|             if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { | ||||
|                 if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) | ||||
|                     continue; | ||||
|             } | ||||
| 
 | ||||
|             // Calculate the barycentric coordinates w0, w1 and w2
 | ||||
|             int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); | ||||
|             int w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); | ||||
|             int w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); | ||||
|             int wsum = w0 + w1 + w2; | ||||
| 
 | ||||
|             // If current pixel is not covered by the current primitive
 | ||||
|             if (w0 < 0 || w1 < 0 || w2 < 0) | ||||
|                 continue; | ||||
| 
 | ||||
|             auto baricentric_coordinates = | ||||
|                 Common::MakeVec(float24::FromFloat32(static_cast<float>(w0)), | ||||
|                                 float24::FromFloat32(static_cast<float>(w1)), | ||||
|                                 float24::FromFloat32(static_cast<float>(w2))); | ||||
|             float24 interpolated_w_inverse = | ||||
|                 float24::FromFloat32(1.0f) / Common::Dot(w_inverse, baricentric_coordinates); | ||||
| 
 | ||||
|             // interpolated_z = z / w
 | ||||
|             float interpolated_z_over_w = | ||||
|                 (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + | ||||
|                  v2.screenpos[2].ToFloat32() * w2) / | ||||
|                 wsum; | ||||
| 
 | ||||
|             // Not fully accurate. About 3 bits in precision are missing.
 | ||||
|             // Z-Buffer (z / w * scale + offset)
 | ||||
|             float depth_scale = float24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); | ||||
|             float depth_offset = | ||||
|                 float24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); | ||||
|             float depth = interpolated_z_over_w * depth_scale + depth_offset; | ||||
| 
 | ||||
|             // Potentially switch to W-Buffer
 | ||||
|             if (regs.rasterizer.depthmap_enable == | ||||
|                 Pica::RasterizerRegs::DepthBuffering::WBuffering) { | ||||
|                 // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
 | ||||
|                 depth *= interpolated_w_inverse.ToFloat32() * wsum; | ||||
|             } | ||||
| 
 | ||||
|             // Clamp the result
 | ||||
|             depth = std::clamp(depth, 0.0f, 1.0f); | ||||
| 
 | ||||
|             // Perspective correct attribute interpolation:
 | ||||
|             // Attribute values cannot be calculated by simple linear interpolation since
 | ||||
|             // they are not linear in screen space. For example, when interpolating a
 | ||||
|             // texture coordinate across two vertices, something simple like
 | ||||
|             //     u = (u0*w0 + u1*w1)/(w0+w1)
 | ||||
|             // will not work. However, the attribute value divided by the
 | ||||
|             // clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
 | ||||
|             // in screenspace. Hence, we can linearly interpolate these two independently and
 | ||||
|             // calculate the interpolated attribute by dividing the results.
 | ||||
|             // I.e.
 | ||||
|             //     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
 | ||||
|             //     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
 | ||||
|             //     u = u_over_w / one_over_w
 | ||||
|             //
 | ||||
|             // The generalization to three vertices is straightforward in baricentric coordinates.
 | ||||
|             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { | ||||
|                 auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); | ||||
|                 float24 interpolated_attr_over_w = | ||||
|                     Common::Dot(attr_over_w, baricentric_coordinates); | ||||
|                 return interpolated_attr_over_w * interpolated_w_inverse; | ||||
|             }; | ||||
| 
 | ||||
|             Common::Vec4<u8> primary_color{ | ||||
|                 static_cast<u8>(round( | ||||
|                     GetInterpolatedAttribute(v0.color.r(), v1.color.r(), v2.color.r()).ToFloat32() * | ||||
|                     255)), | ||||
|                 static_cast<u8>(round( | ||||
|                     GetInterpolatedAttribute(v0.color.g(), v1.color.g(), v2.color.g()).ToFloat32() * | ||||
|                     255)), | ||||
|                 static_cast<u8>(round( | ||||
|                     GetInterpolatedAttribute(v0.color.b(), v1.color.b(), v2.color.b()).ToFloat32() * | ||||
|                     255)), | ||||
|                 static_cast<u8>(round( | ||||
|                     GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * | ||||
|                     255)), | ||||
|             }; | ||||
| 
 | ||||
|             Common::Vec2<float24> uv[3]; | ||||
|             uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); | ||||
|             uv[0].v() = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); | ||||
|             uv[1].u() = GetInterpolatedAttribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); | ||||
|             uv[1].v() = GetInterpolatedAttribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); | ||||
|             uv[2].u() = GetInterpolatedAttribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); | ||||
|             uv[2].v() = GetInterpolatedAttribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); | ||||
| 
 | ||||
|             Common::Vec4<u8> texture_color[4]{}; | ||||
|             for (int i = 0; i < 3; ++i) { | ||||
|                 const auto& texture = textures[i]; | ||||
|                 if (!texture.enabled) | ||||
|                     continue; | ||||
| 
 | ||||
|                 if (texture.config.address == 0) { | ||||
|                     texture_color[i] = {0, 0, 0, 255}; | ||||
|                     continue; | ||||
|                 } | ||||
| 
 | ||||
|                 int coordinate_i = | ||||
|                     (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 1 : i; | ||||
|                 float24 u = uv[coordinate_i].u(); | ||||
|                 float24 v = uv[coordinate_i].v(); | ||||
| 
 | ||||
|                 // Only unit 0 respects the texturing type (according to 3DBrew)
 | ||||
|                 // TODO: Refactor so cubemaps and shadowmaps can be handled
 | ||||
|                 PAddr texture_address = texture.config.GetPhysicalAddress(); | ||||
|                 float24 shadow_z; | ||||
|                 if (i == 0) { | ||||
|                     switch (texture.config.type) { | ||||
|                     case TexturingRegs::TextureConfig::Texture2D: | ||||
|                         break; | ||||
|                     case TexturingRegs::TextureConfig::ShadowCube: | ||||
|                     case TexturingRegs::TextureConfig::TextureCube: { | ||||
|                         auto w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); | ||||
|                         std::tie(u, v, shadow_z, texture_address) = | ||||
|                             ConvertCubeCoord(u, v, w, regs.texturing); | ||||
|                         break; | ||||
|                     } | ||||
|                     case TexturingRegs::TextureConfig::Projection2D: { | ||||
|                         auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); | ||||
|                         u /= tc0_w; | ||||
|                         v /= tc0_w; | ||||
|                         break; | ||||
|                     } | ||||
|                     case TexturingRegs::TextureConfig::Shadow2D: { | ||||
|                         auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); | ||||
|                         if (!regs.texturing.shadow.orthographic) { | ||||
|                             u /= tc0_w; | ||||
|                             v /= tc0_w; | ||||
|                         } | ||||
| 
 | ||||
|                         shadow_z = float24::FromFloat32(std::abs(tc0_w.ToFloat32())); | ||||
|                         break; | ||||
|                     } | ||||
|                     case TexturingRegs::TextureConfig::Disabled: | ||||
|                         continue; // skip this unit and continue to the next unit
 | ||||
|                     default: | ||||
|                         LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type); | ||||
|                         UNIMPLEMENTED(); | ||||
|                         break; | ||||
|                     } | ||||
|                 } | ||||
| 
 | ||||
|                 int s = (int)(u * float24::FromFloat32(static_cast<float>(texture.config.width))) | ||||
|                             .ToFloat32(); | ||||
|                 int t = (int)(v * float24::FromFloat32(static_cast<float>(texture.config.height))) | ||||
|                             .ToFloat32(); | ||||
| 
 | ||||
|                 bool use_border_s = false; | ||||
|                 bool use_border_t = false; | ||||
| 
 | ||||
|                 if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) { | ||||
|                     use_border_s = s < 0 || s >= static_cast<int>(texture.config.width); | ||||
|                 } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) { | ||||
|                     use_border_s = s >= static_cast<int>(texture.config.width); | ||||
|                 } | ||||
| 
 | ||||
|                 if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) { | ||||
|                     use_border_t = t < 0 || t >= static_cast<int>(texture.config.height); | ||||
|                 } else if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder2) { | ||||
|                     use_border_t = t >= static_cast<int>(texture.config.height); | ||||
|                 } | ||||
| 
 | ||||
|                 if (use_border_s || use_border_t) { | ||||
|                     auto border_color = texture.config.border_color; | ||||
|                     texture_color[i] = | ||||
|                         Common::MakeVec(border_color.r.Value(), border_color.g.Value(), | ||||
|                                         border_color.b.Value(), border_color.a.Value()) | ||||
|                             .Cast<u8>(); | ||||
|                 } else { | ||||
|                     // Textures are laid out from bottom to top, hence we invert the t coordinate.
 | ||||
|                     // NOTE: This may not be the right place for the inversion.
 | ||||
|                     // TODO: Check if this applies to ETC textures, too.
 | ||||
|                     s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); | ||||
|                     t = texture.config.height - 1 - | ||||
|                         GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); | ||||
| 
 | ||||
|                     const u8* texture_data = | ||||
|                         VideoCore::g_memory->GetPhysicalPointer(texture_address); | ||||
|                     auto info = | ||||
|                         Texture::TextureInfo::FromPicaRegister(texture.config, texture.format); | ||||
| 
 | ||||
|                     // TODO: Apply the min and mag filters to the texture
 | ||||
|                     texture_color[i] = Texture::LookupTexture(texture_data, s, t, info); | ||||
|                 } | ||||
| 
 | ||||
|                 if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || | ||||
|                                texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) { | ||||
| 
 | ||||
|                     s32 z_int = static_cast<s32>(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF); | ||||
|                     z_int -= regs.texturing.shadow.bias << 1; | ||||
|                     auto& color = texture_color[i]; | ||||
|                     s32 z_ref = (color.w << 16) | (color.z << 8) | color.y; | ||||
|                     u8 density; | ||||
|                     if (z_ref >= z_int) { | ||||
|                         density = color.x; | ||||
|                     } else { | ||||
|                         density = 0; | ||||
|                     } | ||||
|                     texture_color[i] = {density, density, density, density}; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             // sample procedural texture
 | ||||
|             if (regs.texturing.main_config.texture3_enable) { | ||||
|                 const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates]; | ||||
|                 texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(), | ||||
|                                            g_state.regs.texturing, g_state.proctex); | ||||
|             } | ||||
| 
 | ||||
|             // Texture environment - consists of 6 stages of color and alpha combining.
 | ||||
|             //
 | ||||
|             // Color combiners take three input color values from some source (e.g. interpolated
 | ||||
|             // vertex color, texture color, previous stage, etc), perform some very simple
 | ||||
|             // operations on each of them (e.g. inversion) and then calculate the output color
 | ||||
|             // with some basic arithmetic. Alpha combiners can be configured separately but work
 | ||||
|             // analogously.
 | ||||
|             Common::Vec4<u8> combiner_output; | ||||
|             Common::Vec4<u8> combiner_buffer = {0, 0, 0, 0}; | ||||
|             Common::Vec4<u8> next_combiner_buffer = | ||||
|                 Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(), | ||||
|                                 regs.texturing.tev_combiner_buffer_color.g.Value(), | ||||
|                                 regs.texturing.tev_combiner_buffer_color.b.Value(), | ||||
|                                 regs.texturing.tev_combiner_buffer_color.a.Value()) | ||||
|                     .Cast<u8>(); | ||||
| 
 | ||||
|             Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0}; | ||||
|             Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0}; | ||||
| 
 | ||||
|             if (!g_state.regs.lighting.disable) { | ||||
|                 Common::Quaternion<float> normquat = | ||||
|                     Common::Quaternion<float>{ | ||||
|                         {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), | ||||
|                          GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), | ||||
|                          GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, | ||||
|                         GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), | ||||
|                     } | ||||
|                         .Normalized(); | ||||
| 
 | ||||
|                 Common::Vec3<float> view{ | ||||
|                     GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), | ||||
|                     GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), | ||||
|                     GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), | ||||
|                 }; | ||||
|                 std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( | ||||
|                     g_state.regs.lighting, g_state.lighting, normquat, view, texture_color); | ||||
|             } | ||||
| 
 | ||||
|             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size(); | ||||
|                  ++tev_stage_index) { | ||||
|                 const auto& tev_stage = tev_stages[tev_stage_index]; | ||||
|                 using Source = TexturingRegs::TevStageConfig::Source; | ||||
| 
 | ||||
|                 auto GetSource = [&](Source source) -> Common::Vec4<u8> { | ||||
|                     switch (source) { | ||||
|                     case Source::PrimaryColor: | ||||
|                         return primary_color; | ||||
| 
 | ||||
|                     case Source::PrimaryFragmentColor: | ||||
|                         return primary_fragment_color; | ||||
| 
 | ||||
|                     case Source::SecondaryFragmentColor: | ||||
|                         return secondary_fragment_color; | ||||
| 
 | ||||
|                     case Source::Texture0: | ||||
|                         return texture_color[0]; | ||||
| 
 | ||||
|                     case Source::Texture1: | ||||
|                         return texture_color[1]; | ||||
| 
 | ||||
|                     case Source::Texture2: | ||||
|                         return texture_color[2]; | ||||
| 
 | ||||
|                     case Source::Texture3: | ||||
|                         return texture_color[3]; | ||||
| 
 | ||||
|                     case Source::PreviousBuffer: | ||||
|                         return combiner_buffer; | ||||
| 
 | ||||
|                     case Source::Constant: | ||||
|                         return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(), | ||||
|                                                tev_stage.const_b.Value(), tev_stage.const_a.Value()) | ||||
|                             .Cast<u8>(); | ||||
| 
 | ||||
|                     case Source::Previous: | ||||
|                         return combiner_output; | ||||
| 
 | ||||
|                     default: | ||||
|                         LOG_ERROR(HW_GPU, "Unknown color combiner source {}", (int)source); | ||||
|                         UNIMPLEMENTED(); | ||||
|                         return {0, 0, 0, 0}; | ||||
|                     } | ||||
|                 }; | ||||
| 
 | ||||
|                 // color combiner
 | ||||
|                 // NOTE: Not sure if the alpha combiner might use the color output of the previous
 | ||||
|                 //       stage as input. Hence, we currently don't directly write the result to
 | ||||
|                 //       combiner_output.rgb(), but instead store it in a temporary variable until
 | ||||
|                 //       alpha combining has been done.
 | ||||
|                 Common::Vec3<u8> color_result[3] = { | ||||
|                     GetColorModifier(tev_stage.color_modifier1, GetSource(tev_stage.color_source1)), | ||||
|                     GetColorModifier(tev_stage.color_modifier2, GetSource(tev_stage.color_source2)), | ||||
|                     GetColorModifier(tev_stage.color_modifier3, GetSource(tev_stage.color_source3)), | ||||
|                 }; | ||||
|                 auto color_output = ColorCombine(tev_stage.color_op, color_result); | ||||
| 
 | ||||
|                 u8 alpha_output; | ||||
|                 if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) { | ||||
|                     // result of Dot3_RGBA operation is also placed to the alpha component
 | ||||
|                     alpha_output = color_output.x; | ||||
|                 } else { | ||||
|                     // alpha combiner
 | ||||
|                     std::array<u8, 3> alpha_result = {{ | ||||
|                         GetAlphaModifier(tev_stage.alpha_modifier1, | ||||
|                                          GetSource(tev_stage.alpha_source1)), | ||||
|                         GetAlphaModifier(tev_stage.alpha_modifier2, | ||||
|                                          GetSource(tev_stage.alpha_source2)), | ||||
|                         GetAlphaModifier(tev_stage.alpha_modifier3, | ||||
|                                          GetSource(tev_stage.alpha_source3)), | ||||
|                     }}; | ||||
|                     alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result); | ||||
|                 } | ||||
| 
 | ||||
|                 combiner_output[0] = | ||||
|                     std::min((unsigned)255, color_output.r() * tev_stage.GetColorMultiplier()); | ||||
|                 combiner_output[1] = | ||||
|                     std::min((unsigned)255, color_output.g() * tev_stage.GetColorMultiplier()); | ||||
|                 combiner_output[2] = | ||||
|                     std::min((unsigned)255, color_output.b() * tev_stage.GetColorMultiplier()); | ||||
|                 combiner_output[3] = | ||||
|                     std::min((unsigned)255, alpha_output * tev_stage.GetAlphaMultiplier()); | ||||
| 
 | ||||
|                 combiner_buffer = next_combiner_buffer; | ||||
| 
 | ||||
|                 if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor( | ||||
|                         tev_stage_index)) { | ||||
|                     next_combiner_buffer.r() = combiner_output.r(); | ||||
|                     next_combiner_buffer.g() = combiner_output.g(); | ||||
|                     next_combiner_buffer.b() = combiner_output.b(); | ||||
|                 } | ||||
| 
 | ||||
|                 if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha( | ||||
|                         tev_stage_index)) { | ||||
|                     next_combiner_buffer.a() = combiner_output.a(); | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             const auto& output_merger = regs.framebuffer.output_merger; | ||||
| 
 | ||||
|             if (output_merger.fragment_operation_mode == | ||||
|                 FramebufferRegs::FragmentOperationMode::Shadow) { | ||||
|                 u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); | ||||
|                 // use green color as the shadow intensity
 | ||||
|                 u8 stencil = combiner_output.y; | ||||
|                 DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); | ||||
|                 // skip the normal output merger pipeline if it is in shadow mode
 | ||||
|                 continue; | ||||
|             } | ||||
| 
 | ||||
|             // TODO: Does alpha testing happen before or after stencil?
 | ||||
|             if (output_merger.alpha_test.enable) { | ||||
|                 bool pass = false; | ||||
| 
 | ||||
|                 switch (output_merger.alpha_test.func) { | ||||
|                 case FramebufferRegs::CompareFunc::Never: | ||||
|                     pass = false; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Always: | ||||
|                     pass = true; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Equal: | ||||
|                     pass = combiner_output.a() == output_merger.alpha_test.ref; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::NotEqual: | ||||
|                     pass = combiner_output.a() != output_merger.alpha_test.ref; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThan: | ||||
|                     pass = combiner_output.a() < output_merger.alpha_test.ref; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|                     pass = combiner_output.a() <= output_merger.alpha_test.ref; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|                     pass = combiner_output.a() > output_merger.alpha_test.ref; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|                     pass = combiner_output.a() >= output_merger.alpha_test.ref; | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|                 if (!pass) | ||||
|                     continue; | ||||
|             } | ||||
| 
 | ||||
|             // Apply fog combiner
 | ||||
|             // Not fully accurate. We'd have to know what data type is used to
 | ||||
|             // store the depth etc. Using float for now until we know more
 | ||||
|             // about Pica datatypes
 | ||||
|             if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) { | ||||
|                 const Common::Vec3<u8> fog_color = | ||||
|                     Common::MakeVec(regs.texturing.fog_color.r.Value(), | ||||
|                                     regs.texturing.fog_color.g.Value(), | ||||
|                                     regs.texturing.fog_color.b.Value()) | ||||
|                         .Cast<u8>(); | ||||
| 
 | ||||
|                 // Get index into fog LUT
 | ||||
|                 float fog_index; | ||||
|                 if (g_state.regs.texturing.fog_flip) { | ||||
|                     fog_index = (1.0f - depth) * 128.0f; | ||||
|                 } else { | ||||
|                     fog_index = depth * 128.0f; | ||||
|                 } | ||||
| 
 | ||||
|                 // Generate clamped fog factor from LUT for given fog index
 | ||||
|                 float fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f); | ||||
|                 float fog_f = fog_index - fog_i; | ||||
|                 const auto& fog_lut_entry = g_state.fog.lut[static_cast<unsigned int>(fog_i)]; | ||||
|                 float fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f; | ||||
|                 fog_factor = std::clamp(fog_factor, 0.0f, 1.0f); | ||||
| 
 | ||||
|                 // Blend the fog
 | ||||
|                 for (unsigned i = 0; i < 3; i++) { | ||||
|                     combiner_output[i] = static_cast<u8>(fog_factor * combiner_output[i] + | ||||
|                                                          (1.0f - fog_factor) * fog_color[i]); | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             u8 old_stencil = 0; | ||||
| 
 | ||||
|             auto UpdateStencil = [stencil_test, x, y, | ||||
|                                   &old_stencil](Pica::FramebufferRegs::StencilAction action) { | ||||
|                 u8 new_stencil = | ||||
|                     PerformStencilAction(action, old_stencil, stencil_test.reference_value); | ||||
|                 if (g_state.regs.framebuffer.framebuffer.allow_depth_stencil_write != 0) | ||||
|                     SetStencil(x >> 4, y >> 4, | ||||
|                                (new_stencil & stencil_test.write_mask) | | ||||
|                                    (old_stencil & ~stencil_test.write_mask)); | ||||
|             }; | ||||
| 
 | ||||
|             if (stencil_action_enable) { | ||||
|                 old_stencil = GetStencil(x >> 4, y >> 4); | ||||
|                 u8 dest = old_stencil & stencil_test.input_mask; | ||||
|                 u8 ref = stencil_test.reference_value & stencil_test.input_mask; | ||||
| 
 | ||||
|                 bool pass = false; | ||||
|                 switch (stencil_test.func) { | ||||
|                 case FramebufferRegs::CompareFunc::Never: | ||||
|                     pass = false; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Always: | ||||
|                     pass = true; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Equal: | ||||
|                     pass = (ref == dest); | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::NotEqual: | ||||
|                     pass = (ref != dest); | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThan: | ||||
|                     pass = (ref < dest); | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|                     pass = (ref <= dest); | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|                     pass = (ref > dest); | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|                     pass = (ref >= dest); | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|                 if (!pass) { | ||||
|                     UpdateStencil(stencil_test.action_stencil_fail); | ||||
|                     continue; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             // Convert float to integer
 | ||||
|             unsigned num_bits = | ||||
|                 FramebufferRegs::DepthBitsPerPixel(regs.framebuffer.framebuffer.depth_format); | ||||
|             u32 z = (u32)(depth * ((1 << num_bits) - 1)); | ||||
| 
 | ||||
|             if (output_merger.depth_test_enable) { | ||||
|                 u32 ref_z = GetDepth(x >> 4, y >> 4); | ||||
| 
 | ||||
|                 bool pass = false; | ||||
| 
 | ||||
|                 switch (output_merger.depth_test_func) { | ||||
|                 case FramebufferRegs::CompareFunc::Never: | ||||
|                     pass = false; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Always: | ||||
|                     pass = true; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::Equal: | ||||
|                     pass = z == ref_z; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::NotEqual: | ||||
|                     pass = z != ref_z; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThan: | ||||
|                     pass = z < ref_z; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|                     pass = z <= ref_z; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|                     pass = z > ref_z; | ||||
|                     break; | ||||
| 
 | ||||
|                 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|                     pass = z >= ref_z; | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|                 if (!pass) { | ||||
|                     if (stencil_action_enable) | ||||
|                         UpdateStencil(stencil_test.action_depth_fail); | ||||
|                     continue; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             if (regs.framebuffer.framebuffer.allow_depth_stencil_write != 0 && | ||||
|                 output_merger.depth_write_enable) { | ||||
| 
 | ||||
|                 SetDepth(x >> 4, y >> 4, z); | ||||
|             } | ||||
| 
 | ||||
|             // The stencil depth_pass action is executed even if depth testing is disabled
 | ||||
|             if (stencil_action_enable) | ||||
|                 UpdateStencil(stencil_test.action_depth_pass); | ||||
| 
 | ||||
|             auto dest = GetPixel(x >> 4, y >> 4); | ||||
|             Common::Vec4<u8> blend_output = combiner_output; | ||||
| 
 | ||||
|             if (output_merger.alphablend_enable) { | ||||
|                 auto params = output_merger.alpha_blending; | ||||
| 
 | ||||
|                 auto LookupFactor = [&](unsigned channel, | ||||
|                                         FramebufferRegs::BlendFactor factor) -> u8 { | ||||
|                     DEBUG_ASSERT(channel < 4); | ||||
| 
 | ||||
|                     const Common::Vec4<u8> blend_const = | ||||
|                         Common::MakeVec(output_merger.blend_const.r.Value(), | ||||
|                                         output_merger.blend_const.g.Value(), | ||||
|                                         output_merger.blend_const.b.Value(), | ||||
|                                         output_merger.blend_const.a.Value()) | ||||
|                             .Cast<u8>(); | ||||
| 
 | ||||
|                     switch (factor) { | ||||
|                     case FramebufferRegs::BlendFactor::Zero: | ||||
|                         return 0; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::One: | ||||
|                         return 255; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::SourceColor: | ||||
|                         return combiner_output[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusSourceColor: | ||||
|                         return 255 - combiner_output[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::DestColor: | ||||
|                         return dest[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusDestColor: | ||||
|                         return 255 - dest[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::SourceAlpha: | ||||
|                         return combiner_output.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusSourceAlpha: | ||||
|                         return 255 - combiner_output.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::DestAlpha: | ||||
|                         return dest.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusDestAlpha: | ||||
|                         return 255 - dest.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::ConstantColor: | ||||
|                         return blend_const[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusConstantColor: | ||||
|                         return 255 - blend_const[channel]; | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::ConstantAlpha: | ||||
|                         return blend_const.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::OneMinusConstantAlpha: | ||||
|                         return 255 - blend_const.a(); | ||||
| 
 | ||||
|                     case FramebufferRegs::BlendFactor::SourceAlphaSaturate: | ||||
|                         // Returns 1.0 for the alpha channel
 | ||||
|                         if (channel == 3) | ||||
|                             return 255; | ||||
|                         return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a())); | ||||
| 
 | ||||
|                     default: | ||||
|                         LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor); | ||||
|                         UNIMPLEMENTED(); | ||||
|                         break; | ||||
|                     } | ||||
| 
 | ||||
|                     return combiner_output[channel]; | ||||
|                 }; | ||||
| 
 | ||||
|                 auto srcfactor = Common::MakeVec(LookupFactor(0, params.factor_source_rgb), | ||||
|                                                  LookupFactor(1, params.factor_source_rgb), | ||||
|                                                  LookupFactor(2, params.factor_source_rgb), | ||||
|                                                  LookupFactor(3, params.factor_source_a)); | ||||
| 
 | ||||
|                 auto dstfactor = Common::MakeVec(LookupFactor(0, params.factor_dest_rgb), | ||||
|                                                  LookupFactor(1, params.factor_dest_rgb), | ||||
|                                                  LookupFactor(2, params.factor_dest_rgb), | ||||
|                                                  LookupFactor(3, params.factor_dest_a)); | ||||
| 
 | ||||
|                 blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, | ||||
|                                                      params.blend_equation_rgb); | ||||
|                 blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, | ||||
|                                                          dstfactor, params.blend_equation_a) | ||||
|                                        .a(); | ||||
|             } else { | ||||
|                 blend_output = | ||||
|                     Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op), | ||||
|                                     LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op), | ||||
|                                     LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op), | ||||
|                                     LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op)); | ||||
|             } | ||||
| 
 | ||||
|             const Common::Vec4<u8> result = { | ||||
|                 output_merger.red_enable ? blend_output.r() : dest.r(), | ||||
|                 output_merger.green_enable ? blend_output.g() : dest.g(), | ||||
|                 output_merger.blue_enable ? blend_output.b() : dest.b(), | ||||
|                 output_merger.alpha_enable ? blend_output.a() : dest.a(), | ||||
|             }; | ||||
| 
 | ||||
|             if (regs.framebuffer.framebuffer.allow_color_write != 0) | ||||
|                 DrawPixel(x >> 4, y >> 4, result); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2) { | ||||
|     ProcessTriangleInternal(v0, v1, v2); | ||||
| } | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
|  | @ -1,44 +0,0 @@ | |||
| // Copyright 2014 Citra Emulator Project
 | ||||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include "video_core/shader/shader.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| 
 | ||||
| struct Vertex : Shader::OutputVertex { | ||||
|     Vertex(const OutputVertex& v) : OutputVertex(v) {} | ||||
| 
 | ||||
|     // Attributes used to store intermediate results
 | ||||
|     // position after perspective divide
 | ||||
|     Common::Vec3<float24> screenpos; | ||||
| 
 | ||||
|     // Linear interpolation
 | ||||
|     // factor: 0=this, 1=vtx
 | ||||
|     // Note: This function cannot be called after perspective divide
 | ||||
|     void Lerp(float24 factor, const Vertex& vtx) { | ||||
|         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); | ||||
|         quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor); | ||||
|         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); | ||||
|         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); | ||||
|         tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); | ||||
|         tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor); | ||||
|         view = view * factor + vtx.view * (float24::FromFloat32(1) - factor); | ||||
|         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); | ||||
|     } | ||||
| 
 | ||||
|     // Linear interpolation
 | ||||
|     // factor: 0=v0, 1=v1
 | ||||
|     // Note: This function cannot be called after perspective divide
 | ||||
|     static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) { | ||||
|         Vertex ret = v0; | ||||
|         ret.Lerp(factor, v1); | ||||
|         return ret; | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
| void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2); | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
|  | @ -2,18 +2,86 @@ | |||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include "common/color.h" | ||||
| #include "core/core.h" | ||||
| #include "core/hw/gpu.h" | ||||
| #include "core/hw/hw.h" | ||||
| #include "core/hw/lcd.h" | ||||
| #include "video_core/renderer_software/renderer_software.h" | ||||
| 
 | ||||
| namespace VideoCore { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| RendererSoftware::RendererSoftware(Core::System& system, Frontend::EmuWindow& window) | ||||
|     : VideoCore::RendererBase{system, window, nullptr}, | ||||
|       rasterizer{std::make_unique<RasterizerSoftware>()} {} | ||||
|     : VideoCore::RendererBase{system, window, nullptr}, memory{system.Memory()}, | ||||
|       rasterizer{std::make_unique<RasterizerSoftware>(system.Memory())} {} | ||||
| 
 | ||||
| RendererSoftware::~RendererSoftware() = default; | ||||
| 
 | ||||
| void RendererSoftware::SwapBuffers() { | ||||
|     PrepareRenderTarget(); | ||||
|     EndFrame(); | ||||
| } | ||||
| 
 | ||||
| } // namespace VideoCore
 | ||||
| void RendererSoftware::PrepareRenderTarget() { | ||||
|     for (int i : {0, 1, 2}) { | ||||
|         const int fb_id = i == 2 ? 1 : 0; | ||||
|         const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; | ||||
|         auto& info = screen_infos[i]; | ||||
| 
 | ||||
|         u32 lcd_color_addr = | ||||
|             (fb_id == 0) ? LCD_REG_INDEX(color_fill_top) : LCD_REG_INDEX(color_fill_bottom); | ||||
|         lcd_color_addr = HW::VADDR_LCD + 4 * lcd_color_addr; | ||||
|         LCD::Regs::ColorFill color_fill = {0}; | ||||
|         LCD::Read(color_fill.raw, lcd_color_addr); | ||||
| 
 | ||||
|         if (!color_fill.is_enabled) { | ||||
|             const u32 old_width = std::exchange(info.width, framebuffer.width); | ||||
|             const u32 old_height = std::exchange(info.height, framebuffer.height); | ||||
|             if (framebuffer.width != old_width || framebuffer.height != old_height) [[unlikely]] { | ||||
|                 info.pixels.resize(framebuffer.width * framebuffer.height * 4); | ||||
|             } | ||||
|             CopyPixels(i); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void RendererSoftware::CopyPixels(int i) { | ||||
|     const u32 fb_id = i == 2 ? 1 : 0; | ||||
|     const auto& framebuffer = GPU::g_regs.framebuffer_config[fb_id]; | ||||
| 
 | ||||
|     const PAddr framebuffer_addr = | ||||
|         framebuffer.active_fb == 0 ? framebuffer.address_left1 : framebuffer.address_left2; | ||||
|     const s32 bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format); | ||||
|     const u8* framebuffer_data = memory.GetPhysicalPointer(framebuffer_addr); | ||||
| 
 | ||||
|     const s32 stride = framebuffer.stride; | ||||
|     const s32 height = framebuffer.height; | ||||
|     ASSERT(stride * height != 0); | ||||
| 
 | ||||
|     u32 output_offset = 0; | ||||
|     for (u32 y = 0; y < framebuffer.height; y++) { | ||||
|         for (u32 x = 0; x < framebuffer.width; x++) { | ||||
|             const u8* pixel = framebuffer_data + (y * stride + x) * bpp; | ||||
|             const Common::Vec4 color = [&] { | ||||
|                 switch (framebuffer.color_format) { | ||||
|                 case GPU::Regs::PixelFormat::RGBA8: | ||||
|                     return Common::Color::DecodeRGBA8(pixel); | ||||
|                 case GPU::Regs::PixelFormat::RGB8: | ||||
|                     return Common::Color::DecodeRGB8(pixel); | ||||
|                 case GPU::Regs::PixelFormat::RGB565: | ||||
|                     return Common::Color::DecodeRGB565(pixel); | ||||
|                 case GPU::Regs::PixelFormat::RGB5A1: | ||||
|                     return Common::Color::DecodeRGB5A1(pixel); | ||||
|                 case GPU::Regs::PixelFormat::RGBA4: | ||||
|                     return Common::Color::DecodeRGBA4(pixel); | ||||
|                 } | ||||
|                 UNREACHABLE(); | ||||
|             }(); | ||||
|             u8* dest = screen_infos[i].pixels.data() + output_offset; | ||||
|             std::memcpy(dest, color.AsArray(), sizeof(color)); | ||||
|             output_offset += sizeof(color); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -11,7 +11,13 @@ namespace Core { | |||
| class System; | ||||
| } | ||||
| 
 | ||||
| namespace VideoCore { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| struct ScreenInfo { | ||||
|     u32 width; | ||||
|     u32 height; | ||||
|     std::vector<u8> pixels; | ||||
| }; | ||||
| 
 | ||||
| class RendererSoftware : public VideoCore::RendererBase { | ||||
| public: | ||||
|  | @ -22,12 +28,22 @@ public: | |||
|         return rasterizer.get(); | ||||
|     } | ||||
| 
 | ||||
|     [[nodiscard]] const ScreenInfo& Screen(VideoCore::ScreenId id) const noexcept { | ||||
|         return screen_infos[static_cast<u32>(id)]; | ||||
|     } | ||||
| 
 | ||||
|     void SwapBuffers() override; | ||||
|     void TryPresent(int timeout_ms, bool is_secondary) override {} | ||||
|     void Sync() override {} | ||||
| 
 | ||||
| private: | ||||
|     void PrepareRenderTarget(); | ||||
|     void CopyPixels(int i); | ||||
| 
 | ||||
| private: | ||||
|     Memory::MemorySystem& memory; | ||||
|     std::unique_ptr<RasterizerSoftware> rasterizer; | ||||
|     std::array<ScreenInfo, 3> screen_infos{}; | ||||
| }; | ||||
| 
 | ||||
| } // namespace VideoCore
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -1,196 +1,88 @@ | |||
| // Copyright 2014 Citra Emulator Project
 | ||||
| // Copyright 2023 Citra Emulator Project
 | ||||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <array> | ||||
| #include <cstddef> | ||||
| #include <boost/container/static_vector.hpp> | ||||
| #include "common/bit_field.h" | ||||
| #include "common/common_types.h" | ||||
| #include "common/logging/log.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "video_core/pica_state.h" | ||||
| #include "video_core/pica_types.h" | ||||
| #include "video_core/renderer_software/rasterizer.h" | ||||
| #include "video_core/regs_texturing.h" | ||||
| #include "video_core/renderer_software/sw_clipper.h" | ||||
| #include "video_core/shader/shader.h" | ||||
| 
 | ||||
| using Pica::Rasterizer::Vertex; | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| namespace Pica::Clipper { | ||||
| using Pica::TexturingRegs; | ||||
| 
 | ||||
| struct ClippingEdge { | ||||
| public: | ||||
|     ClippingEdge(Common::Vec4<float24> coeffs, | ||||
|                  Common::Vec4<float24> bias = Common::Vec4<float24>(float24::FromFloat32(0), | ||||
|                                                                     float24::FromFloat32(0), | ||||
|                                                                     float24::FromFloat32(0), | ||||
|                                                                     float24::FromFloat32(0))) | ||||
|         : coeffs(coeffs), bias(bias) {} | ||||
| 
 | ||||
|     bool IsInside(const Vertex& vertex) const { | ||||
|         return Common::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0); | ||||
| void FlipQuaternionIfOpposite(Common::Vec4<f24>& a, const Common::Vec4<f24>& b) { | ||||
|     if (Common::Dot(a, b) < f24::Zero()) { | ||||
|         a *= f24::FromFloat32(-1.0f); | ||||
|     } | ||||
| 
 | ||||
|     bool IsOutSide(const Vertex& vertex) const { | ||||
|         return !IsInside(vertex); | ||||
|     } | ||||
| 
 | ||||
|     Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { | ||||
|         float24 dp = Common::Dot(v0.pos + bias, coeffs); | ||||
|         float24 dp_prev = Common::Dot(v1.pos + bias, coeffs); | ||||
|         float24 factor = dp_prev / (dp_prev - dp); | ||||
| 
 | ||||
|         return Vertex::Lerp(factor, v0, v1); | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     [[maybe_unused]] float24 pos; | ||||
|     Common::Vec4<float24> coeffs; | ||||
|     Common::Vec4<float24> bias; | ||||
| }; | ||||
| 
 | ||||
| static void InitScreenCoordinates(Vertex& vtx) { | ||||
|     struct { | ||||
|         float24 halfsize_x; | ||||
|         float24 offset_x; | ||||
|         float24 halfsize_y; | ||||
|         float24 offset_y; | ||||
|         float24 zscale; | ||||
|         float24 offset_z; | ||||
|     } viewport; | ||||
| int SignedArea(const Common::Vec2<Fix12P4>& vtx1, const Common::Vec2<Fix12P4>& vtx2, | ||||
|                const Common::Vec2<Fix12P4>& vtx3) { | ||||
|     const auto vec1 = Common::MakeVec(vtx2 - vtx1, 0); | ||||
|     const auto vec2 = Common::MakeVec(vtx3 - vtx1, 0); | ||||
|     // TODO: There is a very small chance this will overflow for sizeof(int) == 4
 | ||||
|     return Common::Cross(vec1, vec2).z; | ||||
| }; | ||||
| 
 | ||||
|     const auto& regs = g_state.regs; | ||||
|     viewport.halfsize_x = float24::FromRaw(regs.rasterizer.viewport_size_x); | ||||
|     viewport.halfsize_y = float24::FromRaw(regs.rasterizer.viewport_size_y); | ||||
|     viewport.offset_x = float24::FromFloat32(static_cast<float>(regs.rasterizer.viewport_corner.x)); | ||||
|     viewport.offset_y = float24::FromFloat32(static_cast<float>(regs.rasterizer.viewport_corner.y)); | ||||
| 
 | ||||
|     float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w; | ||||
|     vtx.pos.w = inv_w; | ||||
|     vtx.quat *= inv_w; | ||||
|     vtx.color *= inv_w; | ||||
|     vtx.tc0 *= inv_w; | ||||
|     vtx.tc1 *= inv_w; | ||||
|     vtx.tc0_w *= inv_w; | ||||
|     vtx.view *= inv_w; | ||||
|     vtx.tc2 *= inv_w; | ||||
| 
 | ||||
|     vtx.screenpos[0] = | ||||
|         (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; | ||||
|     vtx.screenpos[1] = | ||||
|         (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; | ||||
|     vtx.screenpos[2] = vtx.pos.z * inv_w; | ||||
| } | ||||
| 
 | ||||
| void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2) { | ||||
|     using boost::container::static_vector; | ||||
| 
 | ||||
|     // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at
 | ||||
|     // the new edge (or less in degenerate cases). As such, we can say that each clipping plane
 | ||||
|     // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
 | ||||
|     // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
 | ||||
|     static const std::size_t MAX_VERTICES = 9; | ||||
|     static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2}; | ||||
|     static_vector<Vertex, MAX_VERTICES> buffer_b; | ||||
| 
 | ||||
|     auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) { | ||||
|         if (Common::Dot(a, b) < float24::Zero()) | ||||
|             a = a * float24::FromFloat32(-1.0f); | ||||
|     }; | ||||
| 
 | ||||
|     // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
 | ||||
|     // direction.
 | ||||
|     FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat); | ||||
|     FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat); | ||||
| 
 | ||||
|     auto* output_list = &buffer_a; | ||||
|     auto* input_list = &buffer_b; | ||||
| 
 | ||||
|     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
 | ||||
|     // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
 | ||||
|     //       epsilon possible within float24 accuracy.
 | ||||
|     static const float24 EPSILON = float24::FromFloat32(0.00001f); | ||||
|     static const float24 f0 = float24::FromFloat32(0.0); | ||||
|     static const float24 f1 = float24::FromFloat32(1.0); | ||||
|     static const std::array<ClippingEdge, 7> clipping_edges = {{ | ||||
|         {Common::MakeVec(-f1, f0, f0, f1)}, // x = +w
 | ||||
|         {Common::MakeVec(f1, f0, f0, f1)},  // x = -w
 | ||||
|         {Common::MakeVec(f0, -f1, f0, f1)}, // y = +w
 | ||||
|         {Common::MakeVec(f0, f1, f0, f1)},  // y = -w
 | ||||
|         {Common::MakeVec(f0, f0, -f1, f0)}, // z =  0
 | ||||
|         {Common::MakeVec(f0, f0, f1, f1)},  // z = -w
 | ||||
|         {Common::MakeVec(f0, f0, f0, f1), | ||||
|          Common::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
 | ||||
|     }}; | ||||
| 
 | ||||
|     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
 | ||||
|     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
 | ||||
|     auto Clip = [&](const ClippingEdge& edge) { | ||||
|         std::swap(input_list, output_list); | ||||
|         output_list->clear(); | ||||
| 
 | ||||
|         const Vertex* reference_vertex = &input_list->back(); | ||||
| 
 | ||||
|         for (const auto& vertex : *input_list) { | ||||
|             // NOTE: This algorithm changes vertex order in some cases!
 | ||||
|             if (edge.IsInside(vertex)) { | ||||
|                 if (edge.IsOutSide(*reference_vertex)) { | ||||
|                     output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|                 } | ||||
| 
 | ||||
|                 output_list->push_back(vertex); | ||||
|             } else if (edge.IsInside(*reference_vertex)) { | ||||
|                 output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|             } | ||||
|             reference_vertex = &vertex; | ||||
| std::tuple<f24, f24, f24, PAddr> ConvertCubeCoord(f24 u, f24 v, f24 w, | ||||
|                                                   const Pica::TexturingRegs& regs) { | ||||
|     const float abs_u = std::abs(u.ToFloat32()); | ||||
|     const float abs_v = std::abs(v.ToFloat32()); | ||||
|     const float abs_w = std::abs(w.ToFloat32()); | ||||
|     f24 x, y, z; | ||||
|     PAddr addr; | ||||
|     if (abs_u > abs_v && abs_u > abs_w) { | ||||
|         if (u > f24::Zero()) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveX); | ||||
|             y = -v; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeX); | ||||
|             y = v; | ||||
|         } | ||||
|     }; | ||||
| 
 | ||||
|     for (auto edge : clipping_edges) { | ||||
|         Clip(edge); | ||||
| 
 | ||||
|         // Need to have at least a full triangle to continue...
 | ||||
|         if (output_list->size() < 3) | ||||
|             return; | ||||
|         x = -w; | ||||
|         z = u; | ||||
|     } else if (abs_v > abs_w) { | ||||
|         if (v > f24::Zero()) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveY); | ||||
|             x = u; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeY); | ||||
|             x = -u; | ||||
|         } | ||||
|         y = w; | ||||
|         z = v; | ||||
|     } else { | ||||
|         if (w > f24::Zero()) { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::PositiveZ); | ||||
|             y = -v; | ||||
|         } else { | ||||
|             addr = regs.GetCubePhysicalAddress(TexturingRegs::CubeFace::NegativeZ); | ||||
|             y = v; | ||||
|         } | ||||
|         x = u; | ||||
|         z = w; | ||||
|     } | ||||
|     const f24 z_abs = f24::FromFloat32(std::abs(z.ToFloat32())); | ||||
|     const f24 half = f24::FromFloat32(0.5f); | ||||
|     return std::make_tuple(x / z * half + half, y / z * half + half, z_abs, addr); | ||||
| } | ||||
| 
 | ||||
|     if (g_state.regs.rasterizer.clip_enable) { | ||||
|         ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()}; | ||||
|         Clip(custom_edge); | ||||
| 
 | ||||
|         if (output_list->size() < 3) | ||||
|             return; | ||||
|     } | ||||
| 
 | ||||
|     InitScreenCoordinates((*output_list)[0]); | ||||
|     InitScreenCoordinates((*output_list)[1]); | ||||
| 
 | ||||
|     for (std::size_t i = 0; i < output_list->size() - 2; i++) { | ||||
|         Vertex& vtx0 = (*output_list)[0]; | ||||
|         Vertex& vtx1 = (*output_list)[i + 1]; | ||||
|         Vertex& vtx2 = (*output_list)[i + 2]; | ||||
| 
 | ||||
|         InitScreenCoordinates(vtx2); | ||||
| 
 | ||||
|         LOG_TRACE( | ||||
|             Render_Software, | ||||
|             "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), " | ||||
|             "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and " | ||||
|             "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})", | ||||
|             i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), | ||||
|             vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), | ||||
|             vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), | ||||
|             vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), | ||||
|             vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), | ||||
|             vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(), | ||||
|             vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), | ||||
|             vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), | ||||
|             vtx2.screenpos.z.ToFloat32()); | ||||
| 
 | ||||
|         Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2); | ||||
| bool IsRightSideOrFlatBottomEdge(const Common::Vec2<Fix12P4>& vtx, | ||||
|                                  const Common::Vec2<Fix12P4>& line1, | ||||
|                                  const Common::Vec2<Fix12P4>& line2) { | ||||
|     if (line1.y == line2.y) { | ||||
|         // Just check if vertex is above us => bottom line parallel to x-axis
 | ||||
|         return vtx.y < line1.y; | ||||
|     } else { | ||||
|         // Check if vertex is on our left => right side
 | ||||
|         // TODO: Not sure how likely this is to overflow
 | ||||
|         const auto svtx = vtx.Cast<s32>(); | ||||
|         const auto sline1 = line1.Cast<s32>(); | ||||
|         const auto sline2 = line2.Cast<s32>(); | ||||
|         return svtx.x < | ||||
|                sline1.x + (sline2.x - sline1.x) * (svtx.y - sline1.y) / (sline2.y - sline1.y); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| } // namespace Pica::Clipper
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -1,19 +1,87 @@ | |||
| // Copyright 2014 Citra Emulator Project
 | ||||
| // Copyright 2023 Citra Emulator Project
 | ||||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include "common/common_types.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "video_core/pica_types.h" | ||||
| 
 | ||||
| namespace Pica { | ||||
| namespace Shader { | ||||
| struct OutputVertex; | ||||
| struct TexturingRegs; | ||||
| } | ||||
| 
 | ||||
| namespace Clipper { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| using Shader::OutputVertex; | ||||
| using Pica::f24; | ||||
| 
 | ||||
| void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const OutputVertex& v2); | ||||
| // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
 | ||||
| struct Fix12P4 { | ||||
|     Fix12P4() {} | ||||
|     Fix12P4(u16 val) : val(val) {} | ||||
| 
 | ||||
| } // namespace Clipper
 | ||||
| } // namespace Pica
 | ||||
|     static Fix12P4 FromFloat24(f24 flt) { | ||||
|         // TODO: Rounding here is necessary to prevent garbage pixels at
 | ||||
|         //       triangle borders. Is it that the correct solution, though?
 | ||||
|         return Fix12P4(static_cast<u16>(round(flt.ToFloat32() * 16.0f))); | ||||
|     } | ||||
| 
 | ||||
|     static u16 FracMask() { | ||||
|         return 0xF; | ||||
|     } | ||||
|     static u16 IntMask() { | ||||
|         return static_cast<u16>(~0xF); | ||||
|     } | ||||
| 
 | ||||
|     operator u16() const { | ||||
|         return val; | ||||
|     } | ||||
| 
 | ||||
|     bool operator<(const Fix12P4& oth) const { | ||||
|         return (u16) * this < (u16)oth; | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     u16 val; | ||||
| }; | ||||
| 
 | ||||
| struct Viewport { | ||||
|     f24 halfsize_x; | ||||
|     f24 offset_x; | ||||
|     f24 halfsize_y; | ||||
|     f24 offset_y; | ||||
|     f24 zscale; | ||||
|     f24 offset_z; | ||||
| }; | ||||
| 
 | ||||
| /**
 | ||||
|  * Flips the quaternions if they are opposite to prevent | ||||
|  * interpolating them over the wrong direction. | ||||
|  */ | ||||
| void FlipQuaternionIfOpposite(Common::Vec4<f24>& a, const Common::Vec4<f24>& b); | ||||
| 
 | ||||
| /**
 | ||||
|  * Calculate signed area of the triangle spanned by the three argument vertices. | ||||
|  * The sign denotes an orientation. | ||||
|  **/ | ||||
| int SignedArea(const Common::Vec2<Fix12P4>& vtx1, const Common::Vec2<Fix12P4>& vtx2, | ||||
|                const Common::Vec2<Fix12P4>& vtx3); | ||||
| 
 | ||||
| /**
 | ||||
|  * Convert a 3D vector for cube map coordinates to 2D texture coordinates along with the face name. | ||||
|  **/ | ||||
| std::tuple<f24, f24, f24, PAddr> ConvertCubeCoord(f24 u, f24 v, f24 w, | ||||
|                                                   const Pica::TexturingRegs& regs); | ||||
| 
 | ||||
| /**
 | ||||
|  * Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not | ||||
|  * drawn. Pixels on any other triangle border are drawn. This is implemented with three bias | ||||
|  * values which are added to the barycentric coordinates w0, w1 and w2, respectively. | ||||
|  * NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones... | ||||
|  **/ | ||||
| bool IsRightSideOrFlatBottomEdge(const Common::Vec2<Fix12P4>& vtx, | ||||
|                                  const Common::Vec2<Fix12P4>& line1, | ||||
|                                  const Common::Vec2<Fix12P4>& line2); | ||||
| 
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -3,23 +3,46 @@ | |||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include "common/assert.h" | ||||
| #include "common/color.h" | ||||
| #include "common/common_types.h" | ||||
| #include "common/logging/log.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "core/hw/gpu.h" | ||||
| #include "core/memory.h" | ||||
| #include "video_core/pica_state.h" | ||||
| #include "video_core/pica_types.h" | ||||
| #include "video_core/regs_framebuffer.h" | ||||
| #include "video_core/renderer_software/sw_framebuffer.h" | ||||
| #include "video_core/utils.h" | ||||
| #include "video_core/video_core.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| void DrawPixel(int x, int y, const Common::Vec4<u8>& color) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| using Pica::f16; | ||||
| using Pica::FramebufferRegs; | ||||
| 
 | ||||
| namespace { | ||||
| 
 | ||||
| /// Decode/Encode for shadow map format. It is similar to D24S8 format,
 | ||||
| /// but the depth field is in big-endian.
 | ||||
| const Common::Vec2<u32> DecodeD24S8Shadow(const u8* bytes) { | ||||
|     return {static_cast<u32>((bytes[0] << 16) | (bytes[1] << 8) | bytes[2]), bytes[3]}; | ||||
| } | ||||
| 
 | ||||
| void EncodeD24X8Shadow(u32 depth, u8* bytes) { | ||||
|     bytes[2] = depth & 0xFF; | ||||
|     bytes[1] = (depth >> 8) & 0xFF; | ||||
|     bytes[0] = (depth >> 16) & 0xFF; | ||||
| } | ||||
| 
 | ||||
| void EncodeX24S8Shadow(u8 stencil, u8* bytes) { | ||||
|     bytes[3] = stencil; | ||||
| } | ||||
| } // Anonymous namespace
 | ||||
| 
 | ||||
| Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferRegs& regs_) | ||||
|     : memory{memory_}, regs{regs_} {} | ||||
| 
 | ||||
| Framebuffer::~Framebuffer() = default; | ||||
| 
 | ||||
| void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); | ||||
| 
 | ||||
|     // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
 | ||||
|  | @ -27,33 +50,29 @@ void DrawPixel(int x, int y, const Common::Vec4<u8>& color) { | |||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = | ||||
|     const u32 bytes_per_pixel = | ||||
|         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); | ||||
|     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                      coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     u8* dst_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + dst_offset; | ||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                            coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); | ||||
|     u8* dst_pixel = depth_buffer + dst_offset; | ||||
| 
 | ||||
|     switch (framebuffer.color_format) { | ||||
|     case FramebufferRegs::ColorFormat::RGBA8: | ||||
|         Common::Color::EncodeRGBA8(color, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB8: | ||||
|         Common::Color::EncodeRGB8(color, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB5A1: | ||||
|         Common::Color::EncodeRGB5A1(color, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB565: | ||||
|         Common::Color::EncodeRGB565(color, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGBA4: | ||||
|         Common::Color::EncodeRGBA4(color, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(Render_Software, "Unknown framebuffer color format {:x}", | ||||
|                      static_cast<u32>(framebuffer.color_format.Value())); | ||||
|  | @ -61,35 +80,31 @@ void DrawPixel(int x, int y, const Common::Vec4<u8>& color) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| const Common::Vec4<u8> GetPixel(int x, int y) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = | ||||
|     const u32 bytes_per_pixel = | ||||
|         GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); | ||||
|     u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                      coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     u8* src_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + src_offset; | ||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                            coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     const u8* color_buffer = memory.GetPhysicalPointer(addr); | ||||
|     const u8* src_pixel = color_buffer + src_offset; | ||||
| 
 | ||||
|     switch (framebuffer.color_format) { | ||||
|     case FramebufferRegs::ColorFormat::RGBA8: | ||||
|         return Common::Color::DecodeRGBA8(src_pixel); | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB8: | ||||
|         return Common::Color::DecodeRGB8(src_pixel); | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB5A1: | ||||
|         return Common::Color::DecodeRGB5A1(src_pixel); | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGB565: | ||||
|         return Common::Color::DecodeRGB565(src_pixel); | ||||
| 
 | ||||
|     case FramebufferRegs::ColorFormat::RGBA4: | ||||
|         return Common::Color::DecodeRGBA4(src_pixel); | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(Render_Software, "Unknown framebuffer color format {:x}", | ||||
|                      static_cast<u32>(framebuffer.color_format.Value())); | ||||
|  | @ -99,19 +114,19 @@ const Common::Vec4<u8> GetPixel(int x, int y) { | |||
|     return {0, 0, 0, 0}; | ||||
| } | ||||
| 
 | ||||
| u32 GetDepth(int x, int y) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| u32 Framebuffer::GetDepth(int x, int y) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); | ||||
|     u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     u32 stride = framebuffer.width * bytes_per_pixel; | ||||
|     const u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||
| 
 | ||||
|     u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     u8* src_pixel = depth_buffer + src_offset; | ||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     const u8* depth_buffer = memory.GetPhysicalPointer(addr); | ||||
|     const u8* src_pixel = depth_buffer + src_offset; | ||||
| 
 | ||||
|     switch (framebuffer.depth_format) { | ||||
|     case FramebufferRegs::DepthFormat::D16: | ||||
|  | @ -128,24 +143,23 @@ u32 GetDepth(int x, int y) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| u8 GetStencil(int x, int y) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| u8 Framebuffer::GetStencil(int x, int y) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); | ||||
|     u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     u32 stride = framebuffer.width * bytes_per_pixel; | ||||
|     const u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||
| 
 | ||||
|     u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     u8* src_pixel = depth_buffer + src_offset; | ||||
|     const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     const u8* depth_buffer = memory.GetPhysicalPointer(addr); | ||||
|     const u8* src_pixel = depth_buffer + src_offset; | ||||
| 
 | ||||
|     switch (framebuffer.depth_format) { | ||||
|     case FramebufferRegs::DepthFormat::D24S8: | ||||
|         return Common::Color::DecodeD24S8(src_pixel).y; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_WARNING( | ||||
|             HW_GPU, | ||||
|  | @ -155,33 +169,30 @@ u8 GetStencil(int x, int y) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void SetDepth(int x, int y, u32 value) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| void Framebuffer::SetDepth(int x, int y, u32 value) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); | ||||
|     u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     u32 stride = framebuffer.width * bytes_per_pixel; | ||||
|     const u32 bytes_per_pixel = FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||
| 
 | ||||
|     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); | ||||
|     u8* dst_pixel = depth_buffer + dst_offset; | ||||
| 
 | ||||
|     switch (framebuffer.depth_format) { | ||||
|     case FramebufferRegs::DepthFormat::D16: | ||||
|         Common::Color::EncodeD16(value, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::DepthFormat::D24: | ||||
|         Common::Color::EncodeD24(value, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::DepthFormat::D24S8: | ||||
|         Common::Color::EncodeD24X8(value, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(HW_GPU, "Unimplemented depth format {}", | ||||
|                      static_cast<u32>(framebuffer.depth_format.Value())); | ||||
|  | @ -190,18 +201,18 @@ void SetDepth(int x, int y, u32 value) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void SetStencil(int x, int y, u8 value) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
| void Framebuffer::SetStencil(int x, int y, u8 value) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress(); | ||||
|     u8* depth_buffer = VideoCore::g_memory->GetPhysicalPointer(addr); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     u32 stride = framebuffer.width * bytes_per_pixel; | ||||
|     const u32 bytes_per_pixel = Pica::FramebufferRegs::BytesPerDepthPixel(framebuffer.depth_format); | ||||
|     const u32 stride = framebuffer.width * bytes_per_pixel; | ||||
| 
 | ||||
|     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; | ||||
|     u8* depth_buffer = memory.GetPhysicalPointer(addr); | ||||
|     u8* dst_pixel = depth_buffer + dst_offset; | ||||
| 
 | ||||
|     switch (framebuffer.depth_format) { | ||||
|  | @ -209,11 +220,9 @@ void SetStencil(int x, int y, u8 value) { | |||
|     case Pica::FramebufferRegs::DepthFormat::D24: | ||||
|         // Nothing to do
 | ||||
|         break; | ||||
| 
 | ||||
|     case Pica::FramebufferRegs::DepthFormat::D24S8: | ||||
|         Common::Color::EncodeX24S8(value, dst_pixel); | ||||
|         break; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(HW_GPU, "Unimplemented depth format {}", | ||||
|                      static_cast<u32>(framebuffer.depth_format.Value())); | ||||
|  | @ -222,36 +231,65 @@ void SetStencil(int x, int y, u8 value) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const { | ||||
|     const auto& framebuffer = regs.framebuffer; | ||||
|     const auto& shadow = regs.shadow; | ||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = 4; | ||||
|     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                      coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     u8* shadow_buffer = memory.GetPhysicalPointer(addr); | ||||
|     u8* dst_pixel = shadow_buffer + dst_offset; | ||||
| 
 | ||||
|     const auto ref = DecodeD24S8Shadow(dst_pixel); | ||||
|     const u32 ref_z = ref.x; | ||||
|     const u32 ref_s = ref.y; | ||||
| 
 | ||||
|     if (depth >= ref_z) { | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     if (stencil == 0) { | ||||
|         EncodeD24X8Shadow(depth, dst_pixel); | ||||
|     } else { | ||||
|         const f16 constant = f16::FromRaw(shadow.constant); | ||||
|         const f16 linear = f16::FromRaw(shadow.linear); | ||||
|         const f16 x_ = f16::FromFloat32(static_cast<float>(depth) / ref_z); | ||||
|         const f16 stencil_new = f16::FromFloat32(stencil) / (constant + linear * x_); | ||||
|         stencil = static_cast<u8>(std::clamp(stencil_new.ToFloat32(), 0.0f, 255.0f)); | ||||
| 
 | ||||
|         if (stencil < ref_s) { | ||||
|             EncodeX24S8Shadow(stencil, dst_pixel); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| u8 PerformStencilAction(FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref) { | ||||
|     switch (action) { | ||||
|     case FramebufferRegs::StencilAction::Keep: | ||||
|         return old_stencil; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::Zero: | ||||
|         return 0; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::Replace: | ||||
|         return ref; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::Increment: | ||||
|         // Saturated increment
 | ||||
|         return std::min<u8>(old_stencil, 254) + 1; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::Decrement: | ||||
|         // Saturated decrement
 | ||||
|         return std::max<u8>(old_stencil, 1) - 1; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::Invert: | ||||
|         return ~old_stencil; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::IncrementWrap: | ||||
|         return old_stencil + 1; | ||||
| 
 | ||||
|     case FramebufferRegs::StencilAction::DecrementWrap: | ||||
|         return old_stencil - 1; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(HW_GPU, "Unknown stencil action {:x}", (int)action); | ||||
|         LOG_CRITICAL(HW_GPU, "Unknown stencil action {:x}", static_cast<int>(action)); | ||||
|         UNIMPLEMENTED(); | ||||
|         return 0; | ||||
|     } | ||||
|  | @ -262,24 +300,21 @@ Common::Vec4<u8> EvaluateBlendEquation(const Common::Vec4<u8>& src, | |||
|                                        const Common::Vec4<u8>& dest, | ||||
|                                        const Common::Vec4<u8>& destfactor, | ||||
|                                        FramebufferRegs::BlendEquation equation) { | ||||
|     Common::Vec4<int> result; | ||||
|     Common::Vec4i result; | ||||
| 
 | ||||
|     auto src_result = (src * srcfactor).Cast<int>(); | ||||
|     auto dst_result = (dest * destfactor).Cast<int>(); | ||||
|     const auto src_result = (src * srcfactor).Cast<s32>(); | ||||
|     const auto dst_result = (dest * destfactor).Cast<s32>(); | ||||
| 
 | ||||
|     switch (equation) { | ||||
|     case FramebufferRegs::BlendEquation::Add: | ||||
|         result = (src_result + dst_result) / 255; | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::BlendEquation::Subtract: | ||||
|         result = (src_result - dst_result) / 255; | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::BlendEquation::ReverseSubtract: | ||||
|         result = (dst_result - src_result) / 255; | ||||
|         break; | ||||
| 
 | ||||
|     // TODO: How do these two actually work?  OpenGL doesn't include the blend factors in the
 | ||||
|     //       min/max computations, but is this what the 3DS actually does?
 | ||||
|     case FramebufferRegs::BlendEquation::Min: | ||||
|  | @ -288,14 +323,12 @@ Common::Vec4<u8> EvaluateBlendEquation(const Common::Vec4<u8>& src, | |||
|         result.b() = std::min(src.b(), dest.b()); | ||||
|         result.a() = std::min(src.a(), dest.a()); | ||||
|         break; | ||||
| 
 | ||||
|     case FramebufferRegs::BlendEquation::Max: | ||||
|         result.r() = std::max(src.r(), dest.r()); | ||||
|         result.g() = std::max(src.g(), dest.g()); | ||||
|         result.b() = std::max(src.b(), dest.b()); | ||||
|         result.a() = std::max(src.a(), dest.a()); | ||||
|         break; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation 0x{:x}", equation); | ||||
|         UNIMPLEMENTED(); | ||||
|  | @ -309,103 +342,38 @@ u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op) { | |||
|     switch (op) { | ||||
|     case FramebufferRegs::LogicOp::Clear: | ||||
|         return 0; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::And: | ||||
|         return src & dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::AndReverse: | ||||
|         return src & ~dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Copy: | ||||
|         return src; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Set: | ||||
|         return 255; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::CopyInverted: | ||||
|         return ~src; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::NoOp: | ||||
|         return dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Invert: | ||||
|         return ~dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Nand: | ||||
|         return ~(src & dest); | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Or: | ||||
|         return src | dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Nor: | ||||
|         return ~(src | dest); | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Xor: | ||||
|         return src ^ dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::Equiv: | ||||
|         return ~(src ^ dest); | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::AndInverted: | ||||
|         return ~src & dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::OrReverse: | ||||
|         return src | ~dest; | ||||
| 
 | ||||
|     case FramebufferRegs::LogicOp::OrInverted: | ||||
|         return ~src | dest; | ||||
|     } | ||||
| 
 | ||||
|     UNREACHABLE(); | ||||
| }; | ||||
| 
 | ||||
| // Decode/Encode for shadow map format. It is similar to D24S8 format, but the depth field is in
 | ||||
| // big-endian
 | ||||
| static const Common::Vec2<u32> DecodeD24S8Shadow(const u8* bytes) { | ||||
|     return {static_cast<u32>((bytes[0] << 16) | (bytes[1] << 8) | bytes[2]), bytes[3]}; | ||||
| } | ||||
| 
 | ||||
| static void EncodeD24X8Shadow(u32 depth, u8* bytes) { | ||||
|     bytes[2] = depth & 0xFF; | ||||
|     bytes[1] = (depth >> 8) & 0xFF; | ||||
|     bytes[0] = (depth >> 16) & 0xFF; | ||||
| } | ||||
| 
 | ||||
| static void EncodeX24S8Shadow(u8 stencil, u8* bytes) { | ||||
|     bytes[3] = stencil; | ||||
| } | ||||
| 
 | ||||
| void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) { | ||||
|     const auto& framebuffer = g_state.regs.framebuffer.framebuffer; | ||||
|     const auto& shadow = g_state.regs.framebuffer.shadow; | ||||
|     const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); | ||||
| 
 | ||||
|     y = framebuffer.height - y; | ||||
| 
 | ||||
|     const u32 coarse_y = y & ~7; | ||||
|     u32 bytes_per_pixel = 4; | ||||
|     u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + | ||||
|                      coarse_y * framebuffer.width * bytes_per_pixel; | ||||
|     u8* dst_pixel = VideoCore::g_memory->GetPhysicalPointer(addr) + dst_offset; | ||||
| 
 | ||||
|     auto ref = DecodeD24S8Shadow(dst_pixel); | ||||
|     u32 ref_z = ref.x; | ||||
|     u32 ref_s = ref.y; | ||||
| 
 | ||||
|     if (depth < ref_z) { | ||||
|         if (stencil == 0) { | ||||
|             EncodeD24X8Shadow(depth, dst_pixel); | ||||
|         } else { | ||||
|             float16 constant = float16::FromRaw(shadow.constant); | ||||
|             float16 linear = float16::FromRaw(shadow.linear); | ||||
|             float16 x_ = float16::FromFloat32(static_cast<float>(depth) / ref_z); | ||||
|             float16 stencil_new = float16::FromFloat32(stencil) / (constant + linear * x_); | ||||
|             stencil = static_cast<u8>(std::clamp(stencil_new.ToFloat32(), 0.0f, 255.0f)); | ||||
| 
 | ||||
|             if (stencil < ref_s) | ||||
|                 EncodeX24S8Shadow(stencil, dst_pixel); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -8,24 +8,55 @@ | |||
| #include "common/vector_math.h" | ||||
| #include "video_core/regs_framebuffer.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace Memory { | ||||
| class MemorySystem; | ||||
| } | ||||
| 
 | ||||
| void DrawPixel(int x, int y, const Common::Vec4<u8>& color); | ||||
| const Common::Vec4<u8> GetPixel(int x, int y); | ||||
| u32 GetDepth(int x, int y); | ||||
| u8 GetStencil(int x, int y); | ||||
| void SetDepth(int x, int y, u32 value); | ||||
| void SetStencil(int x, int y, u8 value); | ||||
| u8 PerformStencilAction(FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); | ||||
| namespace Pica { | ||||
| struct FramebufferRegs; | ||||
| } | ||||
| 
 | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| class Framebuffer { | ||||
| public: | ||||
|     explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); | ||||
|     ~Framebuffer(); | ||||
| 
 | ||||
|     /// Draws a pixel at the specified coordinates.
 | ||||
|     void DrawPixel(int x, int y, const Common::Vec4<u8>& color) const; | ||||
| 
 | ||||
|     /// Returns the current color at the specified coordinates.
 | ||||
|     [[nodiscard]] const Common::Vec4<u8> GetPixel(int x, int y) const; | ||||
| 
 | ||||
|     /// Returns the depth value at the specified coordinates.
 | ||||
|     [[nodiscard]] u32 GetDepth(int x, int y) const; | ||||
| 
 | ||||
|     /// Returns the stencil value at the specified coordinates.
 | ||||
|     [[nodiscard]] u8 GetStencil(int x, int y) const; | ||||
| 
 | ||||
|     /// Stores the provided depth value at the specified coordinates.
 | ||||
|     void SetDepth(int x, int y, u32 value) const; | ||||
| 
 | ||||
|     /// Stores the provided stencil value at the specified coordinates.
 | ||||
|     void SetStencil(int x, int y, u8 value) const; | ||||
| 
 | ||||
|     /// Draws a pixel to the shadow buffer.
 | ||||
|     void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const; | ||||
| 
 | ||||
| private: | ||||
|     Memory::MemorySystem& memory; | ||||
|     const Pica::FramebufferRegs& regs; | ||||
| }; | ||||
| 
 | ||||
| u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); | ||||
| 
 | ||||
| Common::Vec4<u8> EvaluateBlendEquation(const Common::Vec4<u8>& src, | ||||
|                                        const Common::Vec4<u8>& srcfactor, | ||||
|                                        const Common::Vec4<u8>& dest, | ||||
|                                        const Common::Vec4<u8>& destfactor, | ||||
|                                        FramebufferRegs::BlendEquation equation); | ||||
|                                        Pica::FramebufferRegs::BlendEquation equation); | ||||
| 
 | ||||
| u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op); | ||||
| u8 LogicOp(u8 src, u8 dest, Pica::FramebufferRegs::LogicOp op); | ||||
| 
 | ||||
| void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil); | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -5,7 +5,10 @@ | |||
| #include <algorithm> | ||||
| #include "video_core/renderer_software/sw_lighting.h" | ||||
| 
 | ||||
| namespace Pica { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| using Pica::f16; | ||||
| using Pica::LightingRegs; | ||||
| 
 | ||||
| static float LookupLightingLut(const Pica::State::Lighting& lighting, std::size_t lut_index, | ||||
|                                u8 index, float delta) { | ||||
|  | @ -14,18 +17,18 @@ static float LookupLightingLut(const Pica::State::Lighting& lighting, std::size_ | |||
| 
 | ||||
|     const auto& lut = lighting.luts[lut_index][index]; | ||||
| 
 | ||||
|     float lut_value = lut.ToFloat(); | ||||
|     float lut_diff = lut.DiffToFloat(); | ||||
|     const float lut_value = lut.ToFloat(); | ||||
|     const float lut_diff = lut.DiffToFloat(); | ||||
| 
 | ||||
|     return lut_value + lut_diff * delta; | ||||
| } | ||||
| 
 | ||||
| std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | ||||
| std::pair<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | ||||
|     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state, | ||||
|     const Common::Quaternion<float>& normquat, const Common::Vec3<float>& view, | ||||
|     const Common::Vec4<u8> (&texture_color)[4]) { | ||||
|     const Common::Quaternion<f32>& normquat, const Common::Vec3f& view, | ||||
|     std::span<const Common::Vec4<u8>, 4> texture_color) { | ||||
| 
 | ||||
|     Common::Vec4<float> shadow; | ||||
|     Common::Vec4f shadow; | ||||
|     if (lighting.config0.enable_shadow) { | ||||
|         shadow = texture_color[lighting.config0.shadow_selector].Cast<float>() / 255.0f; | ||||
|         if (lighting.config0.shadow_invert) { | ||||
|  | @ -35,16 +38,16 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|         shadow = Common::MakeVec(1.0f, 1.0f, 1.0f, 1.0f); | ||||
|     } | ||||
| 
 | ||||
|     Common::Vec3<float> surface_normal{}; | ||||
|     Common::Vec3<float> surface_tangent{}; | ||||
|     Common::Vec3f surface_normal{}; | ||||
|     Common::Vec3f surface_tangent{}; | ||||
| 
 | ||||
|     if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) { | ||||
|         Common::Vec3<float> perturbation = | ||||
|         Common::Vec3f perturbation = | ||||
|             texture_color[lighting.config0.bump_selector].xyz().Cast<float>() / 127.5f - | ||||
|             Common::MakeVec(1.0f, 1.0f, 1.0f); | ||||
|         if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) { | ||||
|             if (!lighting.config0.disable_bump_renorm) { | ||||
|                 const float z_square = 1 - perturbation.xy().Length2(); | ||||
|                 const f32 z_square = 1 - perturbation.xy().Length2(); | ||||
|                 perturbation.z = std::sqrt(std::max(z_square, 0.0f)); | ||||
|             } | ||||
|             surface_normal = perturbation; | ||||
|  | @ -65,66 +68,64 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|     auto normal = Common::QuaternionRotate(normquat, surface_normal); | ||||
|     auto tangent = Common::QuaternionRotate(normquat, surface_tangent); | ||||
| 
 | ||||
|     Common::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f}; | ||||
|     Common::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f}; | ||||
|     Common::Vec4f diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f}; | ||||
|     Common::Vec4f specular_sum = {0.0f, 0.0f, 0.0f, 1.0f}; | ||||
| 
 | ||||
|     for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) { | ||||
|         unsigned num = lighting.light_enable.GetNum(light_index); | ||||
|     for (u32 light_index = 0; light_index <= lighting.max_light_index; ++light_index) { | ||||
|         u32 num = lighting.light_enable.GetNum(light_index); | ||||
|         const auto& light_config = lighting.light[num]; | ||||
| 
 | ||||
|         Common::Vec3<float> refl_value = {}; | ||||
|         Common::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(), | ||||
|                                         float16::FromRaw(light_config.y).ToFloat32(), | ||||
|                                         float16::FromRaw(light_config.z).ToFloat32()}; | ||||
|         Common::Vec3<float> light_vector; | ||||
|         const Common::Vec3f position = {f16::FromRaw(light_config.x).ToFloat32(), | ||||
|                                         f16::FromRaw(light_config.y).ToFloat32(), | ||||
|                                         f16::FromRaw(light_config.z).ToFloat32()}; | ||||
|         Common::Vec3f refl_value{}; | ||||
|         Common::Vec3f light_vector{}; | ||||
| 
 | ||||
|         if (light_config.config.directional) | ||||
|         if (light_config.config.directional) { | ||||
|             light_vector = position; | ||||
|         else | ||||
|         } else { | ||||
|             light_vector = position + view; | ||||
|         } | ||||
| 
 | ||||
|         [[maybe_unused]] float length = light_vector.Normalize(); | ||||
|         [[maybe_unused]] const f32 length = light_vector.Normalize(); | ||||
| 
 | ||||
|         Common::Vec3<float> norm_view = view.Normalized(); | ||||
|         Common::Vec3<float> half_vector = norm_view + light_vector; | ||||
|         Common::Vec3f norm_view = view.Normalized(); | ||||
|         Common::Vec3f half_vector = norm_view + light_vector; | ||||
| 
 | ||||
|         float dist_atten = 1.0f; | ||||
|         f32 dist_atten = 1.0f; | ||||
|         if (!lighting.IsDistAttenDisabled(num)) { | ||||
|             float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32(); | ||||
|             float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32(); | ||||
|             std::size_t lut = | ||||
|             const f32 scale = Pica::f20::FromRaw(light_config.dist_atten_scale).ToFloat32(); | ||||
|             const f32 bias = Pica::f20::FromRaw(light_config.dist_atten_bias).ToFloat32(); | ||||
|             const std::size_t lut = | ||||
|                 static_cast<std::size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num; | ||||
| 
 | ||||
|             float sample_loc = std::clamp(scale * length + bias, 0.0f, 1.0f); | ||||
|             const f32 sample_loc = std::clamp(scale * length + bias, 0.0f, 1.0f); | ||||
| 
 | ||||
|             u8 lutindex = | ||||
|             const u8 lutindex = | ||||
|                 static_cast<u8>(std::clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f)); | ||||
|             float delta = sample_loc * 256 - lutindex; | ||||
|             const f32 delta = sample_loc * 256 - lutindex; | ||||
| 
 | ||||
|             dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta); | ||||
|         } | ||||
| 
 | ||||
|         auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs, | ||||
|                                LightingRegs::LightingScale scale_enum, | ||||
|                                LightingRegs::LightingSampler sampler) { | ||||
|             float result = 0.0f; | ||||
|         auto get_lut_value = [&](LightingRegs::LightingLutInput input, bool abs, | ||||
|                                  LightingRegs::LightingScale scale_enum, | ||||
|                                  LightingRegs::LightingSampler sampler) { | ||||
|             f32 result = 0.0f; | ||||
| 
 | ||||
|             switch (input) { | ||||
|             case LightingRegs::LightingLutInput::NH: | ||||
|                 result = Common::Dot(normal, half_vector.Normalized()); | ||||
|                 break; | ||||
| 
 | ||||
|             case LightingRegs::LightingLutInput::VH: | ||||
|                 result = Common::Dot(norm_view, half_vector.Normalized()); | ||||
|                 break; | ||||
| 
 | ||||
|             case LightingRegs::LightingLutInput::NV: | ||||
|                 result = Common::Dot(normal, norm_view); | ||||
|                 break; | ||||
| 
 | ||||
|             case LightingRegs::LightingLutInput::LN: | ||||
|                 result = Common::Dot(light_vector, normal); | ||||
|                 break; | ||||
| 
 | ||||
|             case LightingRegs::LightingLutInput::SP: { | ||||
|                 Common::Vec3<s32> spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(), | ||||
|                                            light_config.spot_z.Value()}; | ||||
|  | @ -133,8 +134,8 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|             } | ||||
|             case LightingRegs::LightingLutInput::CP: | ||||
|                 if (lighting.config0.config == LightingRegs::LightingConfig::Config7) { | ||||
|                     const Common::Vec3<float> norm_half_vector = half_vector.Normalized(); | ||||
|                     const Common::Vec3<float> half_vector_proj = | ||||
|                     const Common::Vec3f norm_half_vector = half_vector.Normalized(); | ||||
|                     const Common::Vec3f half_vector_proj = | ||||
|                         norm_half_vector - normal * Common::Dot(normal, norm_half_vector); | ||||
|                     result = Common::Dot(half_vector_proj, tangent); | ||||
|                 } else { | ||||
|  | @ -148,58 +149,60 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|             } | ||||
| 
 | ||||
|             u8 index; | ||||
|             float delta; | ||||
|             f32 delta; | ||||
| 
 | ||||
|             if (abs) { | ||||
|                 if (light_config.config.two_sided_diffuse) | ||||
|                 if (light_config.config.two_sided_diffuse) { | ||||
|                     result = std::abs(result); | ||||
|                 else | ||||
|                 } else { | ||||
|                     result = std::max(result, 0.0f); | ||||
|                 } | ||||
| 
 | ||||
|                 float flr = std::floor(result * 256.0f); | ||||
|                 const f32 flr = std::floor(result * 256.0f); | ||||
|                 index = static_cast<u8>(std::clamp(flr, 0.0f, 255.0f)); | ||||
|                 delta = result * 256 - index; | ||||
|             } else { | ||||
|                 float flr = std::floor(result * 128.0f); | ||||
|                 s8 signed_index = static_cast<s8>(std::clamp(flr, -128.0f, 127.0f)); | ||||
|                 const f32 flr = std::floor(result * 128.0f); | ||||
|                 const s8 signed_index = static_cast<s8>(std::clamp(flr, -128.0f, 127.0f)); | ||||
|                 delta = result * 128.0f - signed_index; | ||||
|                 index = static_cast<u8>(signed_index); | ||||
|             } | ||||
| 
 | ||||
|             float scale = lighting.lut_scale.GetScale(scale_enum); | ||||
|             const f32 scale = lighting.lut_scale.GetScale(scale_enum); | ||||
|             return scale * LookupLightingLut(lighting_state, static_cast<std::size_t>(sampler), | ||||
|                                              index, delta); | ||||
|         }; | ||||
| 
 | ||||
|         // If enabled, compute spot light attenuation value
 | ||||
|         float spot_atten = 1.0f; | ||||
|         f32 spot_atten = 1.0f; | ||||
|         if (!lighting.IsSpotAttenDisabled(num) && | ||||
|             LightingRegs::IsLightingSamplerSupported( | ||||
|                 lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) { | ||||
|             auto lut = LightingRegs::SpotlightAttenuationSampler(num); | ||||
|             spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0, | ||||
|                                      lighting.lut_scale.sp, lut); | ||||
|             spot_atten = | ||||
|                 get_lut_value(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0, | ||||
|                               lighting.lut_scale.sp, lut); | ||||
|         } | ||||
| 
 | ||||
|         // Specular 0 component
 | ||||
|         float d0_lut_value = 1.0f; | ||||
|         f32 d0_lut_value = 1.0f; | ||||
|         if (lighting.config1.disable_lut_d0 == 0 && | ||||
|             LightingRegs::IsLightingSamplerSupported( | ||||
|                 lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) { | ||||
|             d0_lut_value = | ||||
|                 GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0, | ||||
|                             lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0); | ||||
|                 get_lut_value(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0, | ||||
|                               lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0); | ||||
|         } | ||||
| 
 | ||||
|         Common::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f(); | ||||
|         Common::Vec3f specular_0 = d0_lut_value * light_config.specular_0.ToVec3f(); | ||||
| 
 | ||||
|         // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
 | ||||
|         if (lighting.config1.disable_lut_rr == 0 && | ||||
|             LightingRegs::IsLightingSamplerSupported(lighting.config0.config, | ||||
|                                                      LightingRegs::LightingSampler::ReflectRed)) { | ||||
|             refl_value.x = | ||||
|                 GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0, | ||||
|                             lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed); | ||||
|                 get_lut_value(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0, | ||||
|                               lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed); | ||||
|         } else { | ||||
|             refl_value.x = 1.0f; | ||||
|         } | ||||
|  | @ -209,8 +212,8 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|             LightingRegs::IsLightingSamplerSupported(lighting.config0.config, | ||||
|                                                      LightingRegs::LightingSampler::ReflectGreen)) { | ||||
|             refl_value.y = | ||||
|                 GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0, | ||||
|                             lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen); | ||||
|                 get_lut_value(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0, | ||||
|                               lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen); | ||||
|         } else { | ||||
|             refl_value.y = refl_value.x; | ||||
|         } | ||||
|  | @ -220,24 +223,23 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|             LightingRegs::IsLightingSamplerSupported(lighting.config0.config, | ||||
|                                                      LightingRegs::LightingSampler::ReflectBlue)) { | ||||
|             refl_value.z = | ||||
|                 GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0, | ||||
|                             lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue); | ||||
|                 get_lut_value(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0, | ||||
|                               lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue); | ||||
|         } else { | ||||
|             refl_value.z = refl_value.x; | ||||
|         } | ||||
| 
 | ||||
|         // Specular 1 component
 | ||||
|         float d1_lut_value = 1.0f; | ||||
|         f32 d1_lut_value = 1.0f; | ||||
|         if (lighting.config1.disable_lut_d1 == 0 && | ||||
|             LightingRegs::IsLightingSamplerSupported( | ||||
|                 lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) { | ||||
|             d1_lut_value = | ||||
|                 GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0, | ||||
|                             lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1); | ||||
|                 get_lut_value(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0, | ||||
|                               lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1); | ||||
|         } | ||||
| 
 | ||||
|         Common::Vec3<float> specular_1 = | ||||
|             d1_lut_value * refl_value * light_config.specular_1.ToVec3f(); | ||||
|         Common::Vec3f specular_1 = d1_lut_value * refl_value * light_config.specular_1.ToVec3f(); | ||||
| 
 | ||||
|         // Fresnel
 | ||||
|         // Note: only the last entry in the light slots applies the Fresnel factor
 | ||||
|  | @ -245,9 +247,9 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|             LightingRegs::IsLightingSamplerSupported(lighting.config0.config, | ||||
|                                                      LightingRegs::LightingSampler::Fresnel)) { | ||||
| 
 | ||||
|             float lut_value = | ||||
|                 GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0, | ||||
|                             lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel); | ||||
|             const f32 lut_value = | ||||
|                 get_lut_value(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0, | ||||
|                               lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel); | ||||
| 
 | ||||
|             // Enabled for diffuse lighting alpha component
 | ||||
|             if (lighting.config0.enable_primary_alpha) { | ||||
|  | @ -261,18 +263,19 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
|         } | ||||
| 
 | ||||
|         auto dot_product = Common::Dot(light_vector, normal); | ||||
|         if (light_config.config.two_sided_diffuse) | ||||
|         if (light_config.config.two_sided_diffuse) { | ||||
|             dot_product = std::abs(dot_product); | ||||
|         else | ||||
|         } else { | ||||
|             dot_product = std::max(dot_product, 0.0f); | ||||
|         } | ||||
| 
 | ||||
|         float clamp_highlights = 1.0f; | ||||
|         f32 clamp_highlights = 1.0f; | ||||
|         if (lighting.config0.clamp_highlights) { | ||||
|             clamp_highlights = dot_product == 0.0f ? 0.0f : 1.0f; | ||||
|         } | ||||
| 
 | ||||
|         if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) { | ||||
|             float geo_factor = half_vector.Length2(); | ||||
|             f32 geo_factor = half_vector.Length2(); | ||||
|             geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f); | ||||
|             if (light_config.config.geometric_factor_0) { | ||||
|                 specular_0 *= geo_factor; | ||||
|  | @ -315,17 +318,17 @@ std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | |||
| 
 | ||||
|     diffuse_sum += Common::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f); | ||||
| 
 | ||||
|     auto diffuse = Common::MakeVec<float>(std::clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(diffuse_sum.w, 0.0f, 1.0f) * 255) | ||||
|                        .Cast<u8>(); | ||||
|     auto specular = Common::MakeVec<float>(std::clamp(specular_sum.x, 0.0f, 1.0f) * 255, | ||||
|                                            std::clamp(specular_sum.y, 0.0f, 1.0f) * 255, | ||||
|                                            std::clamp(specular_sum.z, 0.0f, 1.0f) * 255, | ||||
|                                            std::clamp(specular_sum.w, 0.0f, 1.0f) * 255) | ||||
|                         .Cast<u8>(); | ||||
|     return std::make_tuple(diffuse, specular); | ||||
|     const auto diffuse = Common::MakeVec(std::clamp(diffuse_sum.x, 0.0f, 1.0f) * 255, | ||||
|                                          std::clamp(diffuse_sum.y, 0.0f, 1.0f) * 255, | ||||
|                                          std::clamp(diffuse_sum.z, 0.0f, 1.0f) * 255, | ||||
|                                          std::clamp(diffuse_sum.w, 0.0f, 1.0f) * 255) | ||||
|                              .Cast<u8>(); | ||||
|     const auto specular = Common::MakeVec(std::clamp(specular_sum.x, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(specular_sum.y, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(specular_sum.z, 0.0f, 1.0f) * 255, | ||||
|                                           std::clamp(specular_sum.w, 0.0f, 1.0f) * 255) | ||||
|                               .Cast<u8>(); | ||||
|     return std::make_pair(diffuse, specular); | ||||
| } | ||||
| 
 | ||||
| } // namespace Pica
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -4,16 +4,18 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <tuple> | ||||
| #include <span> | ||||
| #include <utility> | ||||
| 
 | ||||
| #include "common/quaternion.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "video_core/pica_state.h" | ||||
| 
 | ||||
| namespace Pica { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| std::tuple<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | ||||
| std::pair<Common::Vec4<u8>, Common::Vec4<u8>> ComputeFragmentsColors( | ||||
|     const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state, | ||||
|     const Common::Quaternion<float>& normquat, const Common::Vec3<float>& view, | ||||
|     const Common::Vec4<u8> (&texture_color)[4]); | ||||
|     const Common::Quaternion<f32>& normquat, const Common::Vec3f& view, | ||||
|     std::span<const Common::Vec4<u8>, 4> texture_color); | ||||
| 
 | ||||
| } // namespace Pica
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -4,17 +4,18 @@ | |||
| 
 | ||||
| #include <array> | ||||
| #include <cmath> | ||||
| #include "common/math_util.h" | ||||
| #include "video_core/renderer_software/sw_proctex.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| using ProcTexClamp = TexturingRegs::ProcTexClamp; | ||||
| using ProcTexShift = TexturingRegs::ProcTexShift; | ||||
| using ProcTexCombiner = TexturingRegs::ProcTexCombiner; | ||||
| using ProcTexFilter = TexturingRegs::ProcTexFilter; | ||||
| namespace { | ||||
| using ProcTexClamp = Pica::TexturingRegs::ProcTexClamp; | ||||
| using ProcTexShift = Pica::TexturingRegs::ProcTexShift; | ||||
| using ProcTexCombiner = Pica::TexturingRegs::ProcTexCombiner; | ||||
| using ProcTexFilter = Pica::TexturingRegs::ProcTexFilter; | ||||
| using Pica::f16; | ||||
| 
 | ||||
| static float LookupLUT(const std::array<State::ProcTex::ValueEntry, 128>& lut, float coord) { | ||||
| float LookupLUT(const std::array<Pica::State::ProcTex::ValueEntry, 128>& lut, float coord) { | ||||
|     // For NoiseLUT/ColorMap/AlphaMap, coord=0.0 is lut[0], coord=127.0/128.0 is lut[127] and
 | ||||
|     // coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using
 | ||||
|     // value entries and difference entries.
 | ||||
|  | @ -26,13 +27,13 @@ static float LookupLUT(const std::array<State::ProcTex::ValueEntry, 128>& lut, f | |||
| 
 | ||||
| // These functions are used to generate random noise for procedural texture. Their results are
 | ||||
| // verified against real hardware, but it's not known if the algorithm is the same as hardware.
 | ||||
| static unsigned int NoiseRand1D(unsigned int v) { | ||||
| unsigned int NoiseRand1D(unsigned int v) { | ||||
|     static constexpr std::array<unsigned int, 16> table{ | ||||
|         {0, 4, 10, 8, 4, 9, 7, 12, 5, 15, 13, 14, 11, 15, 2, 11}}; | ||||
|     return ((v % 9 + 2) * 3 & 0xF) ^ table[(v / 9) & 0xF]; | ||||
| } | ||||
| 
 | ||||
| static float NoiseRand2D(unsigned int x, unsigned int y) { | ||||
| float NoiseRand2D(unsigned int x, unsigned int y) { | ||||
|     static constexpr std::array<unsigned int, 16> table{ | ||||
|         {10, 2, 15, 8, 0, 7, 4, 5, 5, 13, 2, 6, 13, 9, 3, 14}}; | ||||
|     unsigned int u2 = NoiseRand1D(x); | ||||
|  | @ -45,11 +46,12 @@ static float NoiseRand2D(unsigned int x, unsigned int y) { | |||
|     return -1.0f + v2 * 2.0f / 15.0f; | ||||
| } | ||||
| 
 | ||||
| static float NoiseCoef(float u, float v, const TexturingRegs& regs, const State::ProcTex& state) { | ||||
|     const float freq_u = float16::FromRaw(regs.proctex_noise_frequency.u).ToFloat32(); | ||||
|     const float freq_v = float16::FromRaw(regs.proctex_noise_frequency.v).ToFloat32(); | ||||
|     const float phase_u = float16::FromRaw(regs.proctex_noise_u.phase).ToFloat32(); | ||||
|     const float phase_v = float16::FromRaw(regs.proctex_noise_v.phase).ToFloat32(); | ||||
| float NoiseCoef(float u, float v, const Pica::TexturingRegs& regs, | ||||
|                 const Pica::State::ProcTex& state) { | ||||
|     const float freq_u = f16::FromRaw(regs.proctex_noise_frequency.u).ToFloat32(); | ||||
|     const float freq_v = f16::FromRaw(regs.proctex_noise_frequency.v).ToFloat32(); | ||||
|     const float phase_u = f16::FromRaw(regs.proctex_noise_u.phase).ToFloat32(); | ||||
|     const float phase_v = f16::FromRaw(regs.proctex_noise_v.phase).ToFloat32(); | ||||
|     const float x = 9 * freq_u * std::abs(u + phase_u); | ||||
|     const float y = 9 * freq_v * std::abs(v + phase_v); | ||||
|     const int x_int = static_cast<int>(x); | ||||
|  | @ -66,7 +68,7 @@ static float NoiseCoef(float u, float v, const TexturingRegs& regs, const State: | |||
|     return Common::BilinearInterp(g0, g1, g2, g3, x_noise, y_noise); | ||||
| } | ||||
| 
 | ||||
| static float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) { | ||||
| float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) { | ||||
|     const float offset = (clamp_mode == ProcTexClamp::MirroredRepeat) ? 1 : 0.5f; | ||||
|     switch (mode) { | ||||
|     case ProcTexShift::None: | ||||
|  | @ -81,7 +83,7 @@ static float GetShiftOffset(float v, ProcTexShift mode, ProcTexClamp clamp_mode) | |||
|     } | ||||
| }; | ||||
| 
 | ||||
| static void ClampCoord(float& coord, ProcTexClamp mode) { | ||||
| void ClampCoord(float& coord, ProcTexClamp mode) { | ||||
|     switch (mode) { | ||||
|     case ProcTexClamp::ToZero: | ||||
|         if (coord > 1.0f) | ||||
|  | @ -112,8 +114,8 @@ static void ClampCoord(float& coord, ProcTexClamp mode) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| static float CombineAndMap(float u, float v, ProcTexCombiner combiner, | ||||
|                            const std::array<State::ProcTex::ValueEntry, 128>& map_table) { | ||||
| float CombineAndMap(float u, float v, ProcTexCombiner combiner, | ||||
|                     const std::array<Pica::State::ProcTex::ValueEntry, 128>& map_table) { | ||||
|     float f; | ||||
|     switch (combiner) { | ||||
|     case ProcTexCombiner::U: | ||||
|  | @ -122,28 +124,28 @@ static float CombineAndMap(float u, float v, ProcTexCombiner combiner, | |||
|     case ProcTexCombiner::U2: | ||||
|         f = u * u; | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::V: | ||||
|     case ProcTexCombiner::V: | ||||
|         f = v; | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::V2: | ||||
|     case ProcTexCombiner::V2: | ||||
|         f = v * v; | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::Add: | ||||
|     case ProcTexCombiner::Add: | ||||
|         f = (u + v) * 0.5f; | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::Add2: | ||||
|     case ProcTexCombiner::Add2: | ||||
|         f = (u * u + v * v) * 0.5f; | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::SqrtAdd2: | ||||
|     case ProcTexCombiner::SqrtAdd2: | ||||
|         f = std::min(std::sqrt(u * u + v * v), 1.0f); | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::Min: | ||||
|     case ProcTexCombiner::Min: | ||||
|         f = std::min(u, v); | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::Max: | ||||
|     case ProcTexCombiner::Max: | ||||
|         f = std::max(u, v); | ||||
|         break; | ||||
|     case TexturingRegs::ProcTexCombiner::RMax: | ||||
|     case ProcTexCombiner::RMax: | ||||
|         f = std::min(((u + v) * 0.5f + std::sqrt(u * u + v * v)) * 0.5f, 1.0f); | ||||
|         break; | ||||
|     default: | ||||
|  | @ -153,8 +155,10 @@ static float CombineAndMap(float u, float v, ProcTexCombiner combiner, | |||
|     } | ||||
|     return LookupLUT(map_table, f); | ||||
| } | ||||
| } // Anonymous namespace
 | ||||
| 
 | ||||
| Common::Vec4<u8> ProcTex(float u, float v, const TexturingRegs& regs, const State::ProcTex& state) { | ||||
| Common::Vec4<u8> ProcTex(float u, float v, const Pica::TexturingRegs& regs, | ||||
|                          const Pica::State::ProcTex& state) { | ||||
|     u = std::abs(u); | ||||
|     v = std::abs(v); | ||||
| 
 | ||||
|  | @ -218,4 +222,4 @@ Common::Vec4<u8> ProcTex(float u, float v, const TexturingRegs& regs, const Stat | |||
|     } | ||||
| } | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -8,9 +8,10 @@ | |||
| #include "common/vector_math.h" | ||||
| #include "video_core/pica_state.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| /// Generates procedural texture color for the given coordinates
 | ||||
| Common::Vec4<u8> ProcTex(float u, float v, const TexturingRegs& regs, const State::ProcTex& state); | ||||
| Common::Vec4<u8> ProcTex(float u, float v, const Pica::TexturingRegs& regs, | ||||
|                          const Pica::State::ProcTex& state); | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -2,15 +2,937 @@ | |||
| // Licensed under GPLv2 or any later version
 | ||||
| // Refer to the license.txt file included.
 | ||||
| 
 | ||||
| #include "video_core/renderer_software/sw_clipper.h" | ||||
| #include <boost/container/static_vector.hpp> | ||||
| #include "common/logging/log.h" | ||||
| #include "common/microprofile.h" | ||||
| #include "common/quaternion.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "core/memory.h" | ||||
| #include "video_core/pica_state.h" | ||||
| #include "video_core/pica_types.h" | ||||
| #include "video_core/renderer_software/sw_framebuffer.h" | ||||
| #include "video_core/renderer_software/sw_lighting.h" | ||||
| #include "video_core/renderer_software/sw_proctex.h" | ||||
| #include "video_core/renderer_software/sw_rasterizer.h" | ||||
| #include "video_core/renderer_software/sw_texturing.h" | ||||
| #include "video_core/shader/shader.h" | ||||
| #include "video_core/texture/texture_decode.h" | ||||
| 
 | ||||
| namespace VideoCore { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| using Pica::f24; | ||||
| using Pica::FramebufferRegs; | ||||
| using Pica::RasterizerRegs; | ||||
| using Pica::TexturingRegs; | ||||
| using Pica::Texture::LookupTexture; | ||||
| using Pica::Texture::TextureInfo; | ||||
| 
 | ||||
| struct Vertex : Pica::Shader::OutputVertex { | ||||
|     Vertex(const OutputVertex& v) : OutputVertex(v) {} | ||||
| 
 | ||||
|     /// Intermediate result: the vertex position after the perspective divide.
 | ||||
|     Common::Vec3<f24> screenpos; | ||||
| 
 | ||||
|     /**
 | ||||
|      * Linear interpolation | ||||
|      * factor: 1=this, 0=vtx (result = this * factor + vtx * (1 - factor)) | ||||
|      * Note: This function cannot be called after perspective divide. | ||||
|      **/ | ||||
|     void Lerp(f24 factor, const Vertex& vtx) { | ||||
|         pos = pos * factor + vtx.pos * (f24::One() - factor); | ||||
|         quat = quat * factor + vtx.quat * (f24::One() - factor); | ||||
|         color = color * factor + vtx.color * (f24::One() - factor); | ||||
|         tc0 = tc0 * factor + vtx.tc0 * (f24::One() - factor); | ||||
|         tc1 = tc1 * factor + vtx.tc1 * (f24::One() - factor); | ||||
|         tc0_w = tc0_w * factor + vtx.tc0_w * (f24::One() - factor); | ||||
|         view = view * factor + vtx.view * (f24::One() - factor); | ||||
|         tc2 = tc2 * factor + vtx.tc2 * (f24::One() - factor); | ||||
|     } | ||||
| 
 | ||||
|     /**
 | ||||
|      * Linear interpolation | ||||
|      * factor: 0=v0, 1=v1 | ||||
|      * Note: This function cannot be called after perspective divide. | ||||
|      **/ | ||||
|     static Vertex Lerp(f24 factor, const Vertex& v0, const Vertex& v1) { | ||||
|         Vertex ret = v0; | ||||
|         ret.Lerp(factor, v1); | ||||
|         return ret; | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
| namespace { | ||||
| 
 | ||||
| MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); | ||||
| 
 | ||||
| struct ClippingEdge { | ||||
| public: | ||||
|     constexpr ClippingEdge(Common::Vec4<f24> coeffs, | ||||
|                            Common::Vec4<f24> bias = Common::Vec4<f24>(f24::Zero(), f24::Zero(), | ||||
|                                                                       f24::Zero(), f24::Zero())) | ||||
|         : pos(f24::Zero()), coeffs(coeffs), bias(bias) {} | ||||
| 
 | ||||
|     bool IsInside(const Vertex& vertex) const { | ||||
|         return Common::Dot(vertex.pos + bias, coeffs) >= f24::Zero(); | ||||
|     } | ||||
| 
 | ||||
|     bool IsOutSide(const Vertex& vertex) const { | ||||
|         return !IsInside(vertex); | ||||
|     } | ||||
| 
 | ||||
|     Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { | ||||
|         const f24 dp = Common::Dot(v0.pos + bias, coeffs); | ||||
|         const f24 dp_prev = Common::Dot(v1.pos + bias, coeffs); | ||||
|         const f24 factor = dp_prev / (dp_prev - dp); | ||||
|         return Vertex::Lerp(factor, v0, v1); | ||||
|     } | ||||
| 
 | ||||
| private: | ||||
|     [[maybe_unused]] f24 pos; | ||||
|     Common::Vec4<f24> coeffs; | ||||
|     Common::Vec4<f24> bias; | ||||
| }; | ||||
| 
 | ||||
| } // Anonymous namespace
 | ||||
| 
 | ||||
/// Sets up the software rasterizer: keeps the given memory system, takes the Pica state from the
/// global Pica::g_state, takes the registers from that state, and constructs the framebuffer
/// helper from the memory system and the framebuffer registers.
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
    : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {}
| 
 | ||||
| void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, | ||||
|                                      const Pica::Shader::OutputVertex& v1, | ||||
|                                      const Pica::Shader::OutputVertex& v2) { | ||||
|     Pica::Clipper::ProcessTriangle(v0, v1, v2); | ||||
|     /**
 | ||||
|      * Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at | ||||
|      * the new edge (or less in degenerate cases). As such, we can say that each clipping plane | ||||
|      * introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a | ||||
|      * fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. | ||||
|      **/ | ||||
|     static constexpr std::size_t MAX_VERTICES = 9; | ||||
| 
 | ||||
|     boost::container::static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2}; | ||||
|     boost::container::static_vector<Vertex, MAX_VERTICES> buffer_b; | ||||
| 
 | ||||
|     FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat); | ||||
|     FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat); | ||||
| 
 | ||||
|     auto* output_list = &buffer_a; | ||||
|     auto* input_list = &buffer_b; | ||||
| 
 | ||||
|     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
 | ||||
|     // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
 | ||||
|     //       epsilon possible within f24 accuracy.
 | ||||
|     static constexpr f24 EPSILON = f24::FromFloat32(0.00001f); | ||||
|     static constexpr f24 f0 = f24::Zero(); | ||||
|     static constexpr f24 f1 = f24::One(); | ||||
|     static constexpr std::array<ClippingEdge, 7> clipping_edges = {{ | ||||
|         {Common::MakeVec(-f1, f0, f0, f1)},                                        // x = +w
 | ||||
|         {Common::MakeVec(f1, f0, f0, f1)},                                         // x = -w
 | ||||
|         {Common::MakeVec(f0, -f1, f0, f1)},                                        // y = +w
 | ||||
|         {Common::MakeVec(f0, f1, f0, f1)},                                         // y = -w
 | ||||
|         {Common::MakeVec(f0, f0, -f1, f0)},                                        // z =  0
 | ||||
|         {Common::MakeVec(f0, f0, f1, f1)},                                         // z = -w
 | ||||
|         {Common::MakeVec(f0, f0, f0, f1), Common::Vec4<f24>(f0, f0, f0, EPSILON)}, // w = EPSILON
 | ||||
|     }}; | ||||
| 
 | ||||
|     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
 | ||||
|     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
 | ||||
|     const auto clip = [&](const ClippingEdge& edge) { | ||||
|         std::swap(input_list, output_list); | ||||
|         output_list->clear(); | ||||
| 
 | ||||
|         const Vertex* reference_vertex = &input_list->back(); | ||||
|         for (const auto& vertex : *input_list) { | ||||
|             // NOTE: This algorithm changes vertex order in some cases!
 | ||||
|             if (edge.IsInside(vertex)) { | ||||
|                 if (edge.IsOutSide(*reference_vertex)) { | ||||
|                     output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|                 } | ||||
|                 output_list->push_back(vertex); | ||||
|             } else if (edge.IsInside(*reference_vertex)) { | ||||
|                 output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); | ||||
|             } | ||||
|             reference_vertex = &vertex; | ||||
|         } | ||||
|     }; | ||||
| 
 | ||||
|     for (const ClippingEdge& edge : clipping_edges) { | ||||
|         clip(edge); | ||||
|         if (output_list->size() < 3) { | ||||
|             return; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (state.regs.rasterizer.clip_enable) { | ||||
|         const ClippingEdge custom_edge{state.regs.rasterizer.GetClipCoef()}; | ||||
|         clip(custom_edge); | ||||
|         if (output_list->size() < 3) { | ||||
|             return; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     MakeScreenCoords((*output_list)[0]); | ||||
|     MakeScreenCoords((*output_list)[1]); | ||||
| 
 | ||||
|     for (std::size_t i = 0; i < output_list->size() - 2; i++) { | ||||
|         Vertex& vtx0 = (*output_list)[0]; | ||||
|         Vertex& vtx1 = (*output_list)[i + 1]; | ||||
|         Vertex& vtx2 = (*output_list)[i + 2]; | ||||
| 
 | ||||
|         MakeScreenCoords(vtx2); | ||||
| 
 | ||||
|         LOG_TRACE( | ||||
|             Render_Software, | ||||
|             "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), " | ||||
|             "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and " | ||||
|             "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})", | ||||
|             i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), | ||||
|             vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), | ||||
|             vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), | ||||
|             vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), | ||||
|             vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), | ||||
|             vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(), | ||||
|             vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), | ||||
|             vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), | ||||
|             vtx2.screenpos.z.ToFloat32()); | ||||
| 
 | ||||
|         ProcessTriangle(vtx0, vtx1, vtx2); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| } // namespace VideoCore
 | ||||
| void RasterizerSoftware::MakeScreenCoords(Vertex& vtx) { | ||||
|     Viewport viewport{}; | ||||
|     viewport.halfsize_x = f24::FromRaw(regs.rasterizer.viewport_size_x); | ||||
|     viewport.halfsize_y = f24::FromRaw(regs.rasterizer.viewport_size_y); | ||||
|     viewport.offset_x = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.x)); | ||||
|     viewport.offset_y = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.y)); | ||||
| 
 | ||||
|     f24 inv_w = f24::One() / vtx.pos.w; | ||||
|     vtx.pos.w = inv_w; | ||||
|     vtx.quat *= inv_w; | ||||
|     vtx.color *= inv_w; | ||||
|     vtx.tc0 *= inv_w; | ||||
|     vtx.tc1 *= inv_w; | ||||
|     vtx.tc0_w *= inv_w; | ||||
|     vtx.view *= inv_w; | ||||
|     vtx.tc2 *= inv_w; | ||||
| 
 | ||||
|     vtx.screenpos[0] = (vtx.pos.x * inv_w + f24::One()) * viewport.halfsize_x + viewport.offset_x; | ||||
|     vtx.screenpos[1] = (vtx.pos.y * inv_w + f24::One()) * viewport.halfsize_y + viewport.offset_y; | ||||
|     vtx.screenpos[2] = vtx.pos.z * inv_w; | ||||
| } | ||||
| 
 | ||||
| void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2, | ||||
|                                          bool reversed) { | ||||
|     MICROPROFILE_SCOPE(GPU_Rasterization); | ||||
| 
 | ||||
|     // Vertex positions in rasterizer coordinates
 | ||||
|     static auto screen_to_rasterizer_coords = [](const Common::Vec3<f24>& vec) { | ||||
|         return Common::Vec3{Fix12P4::FromFloat24(vec.x), Fix12P4::FromFloat24(vec.y), | ||||
|                             Fix12P4::FromFloat24(vec.z)}; | ||||
|     }; | ||||
| 
 | ||||
|     const std::array<Common::Vec3<Fix12P4>, 3> vtxpos = { | ||||
|         screen_to_rasterizer_coords(v0.screenpos), | ||||
|         screen_to_rasterizer_coords(v1.screenpos), | ||||
|         screen_to_rasterizer_coords(v2.screenpos), | ||||
|     }; | ||||
| 
 | ||||
|     if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) { | ||||
|         // Make sure we always end up with a triangle wound counter-clockwise
 | ||||
|         if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { | ||||
|             ProcessTriangle(v0, v2, v1, true); | ||||
|             return; | ||||
|         } | ||||
|     } else { | ||||
|         if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) { | ||||
|             // Reverse vertex order and use the CCW code path.
 | ||||
|             ProcessTriangle(v0, v2, v1, true); | ||||
|             return; | ||||
|         } | ||||
|         // Cull away triangles which are wound clockwise.
 | ||||
|         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { | ||||
|             return; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); | ||||
|     u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); | ||||
|     u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); | ||||
|     u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); | ||||
| 
 | ||||
|     // Convert the scissor box coordinates to 12.4 fixed point
 | ||||
|     const u16 scissor_x1 = static_cast<u16>(regs.rasterizer.scissor_test.x1 << 4); | ||||
|     const u16 scissor_y1 = static_cast<u16>(regs.rasterizer.scissor_test.y1 << 4); | ||||
|     // x2,y2 have +1 added to cover the entire sub-pixel area
 | ||||
|     const u16 scissor_x2 = static_cast<u16>((regs.rasterizer.scissor_test.x2 + 1) << 4); | ||||
|     const u16 scissor_y2 = static_cast<u16>((regs.rasterizer.scissor_test.y2 + 1) << 4); | ||||
| 
 | ||||
|     if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) { | ||||
|         // Calculate the new bounds
 | ||||
|         min_x = std::max(min_x, scissor_x1); | ||||
|         min_y = std::max(min_y, scissor_y1); | ||||
|         max_x = std::min(max_x, scissor_x2); | ||||
|         max_y = std::min(max_y, scissor_y2); | ||||
|     } | ||||
| 
 | ||||
|     min_x &= Fix12P4::IntMask(); | ||||
|     min_y &= Fix12P4::IntMask(); | ||||
|     max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask()); | ||||
|     max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask()); | ||||
| 
 | ||||
|     const int bias0 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0; | ||||
|     const int bias1 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; | ||||
|     const int bias2 = | ||||
|         IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0; | ||||
| 
 | ||||
|     const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); | ||||
| 
 | ||||
|     auto textures = regs.texturing.GetTextures(); | ||||
|     const auto tev_stages = regs.texturing.GetTevStages(); | ||||
| 
 | ||||
|     const bool stencil_action_enable = | ||||
|         regs.framebuffer.output_merger.stencil_test.enable && | ||||
|         regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8; | ||||
|     const auto stencil_test = regs.framebuffer.output_merger.stencil_test; | ||||
| 
 | ||||
|     // Enter rasterization loop, starting at the center of the topleft bounding box corner.
 | ||||
|     // TODO: Not sure if looping through x first might be faster
 | ||||
|     for (u16 y = min_y + 8; y < max_y; y += 0x10) { | ||||
|         for (u16 x = min_x + 8; x < max_x; x += 0x10) { | ||||
|             // Do not process the pixel if it's inside the scissor box and the scissor mode is set
 | ||||
|             // to Exclude.
 | ||||
|             if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { | ||||
|                 if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { | ||||
|                     continue; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             // Calculate the barycentric coordinates w0, w1 and w2
 | ||||
|             const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); | ||||
|             const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); | ||||
|             const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); | ||||
|             const s32 wsum = w0 + w1 + w2; | ||||
| 
 | ||||
|             // If current pixel is not covered by the current primitive
 | ||||
|             if (w0 < 0 || w1 < 0 || w2 < 0) { | ||||
|                 continue; | ||||
|             } | ||||
| 
 | ||||
|             const auto baricentric_coordinates = Common::MakeVec( | ||||
|                 f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)), | ||||
|                 f24::FromFloat32(static_cast<f32>(w2))); | ||||
|             const f24 interpolated_w_inverse = | ||||
|                 f24::One() / Common::Dot(w_inverse, baricentric_coordinates); | ||||
| 
 | ||||
|             // interpolated_z = z / w
 | ||||
|             const float interpolated_z_over_w = | ||||
|                 (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + | ||||
|                  v2.screenpos[2].ToFloat32() * w2) / | ||||
|                 wsum; | ||||
| 
 | ||||
|             // Not fully accurate. About 3 bits in precision are missing.
 | ||||
|             // Z-Buffer (z / w * scale + offset)
 | ||||
|             const float depth_scale = | ||||
|                 f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); | ||||
|             const float depth_offset = | ||||
|                 f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); | ||||
|             float depth = interpolated_z_over_w * depth_scale + depth_offset; | ||||
| 
 | ||||
|             // Potentially switch to W-Buffer
 | ||||
|             if (regs.rasterizer.depthmap_enable == | ||||
|                 Pica::RasterizerRegs::DepthBuffering::WBuffering) { | ||||
|                 // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
 | ||||
|                 depth *= interpolated_w_inverse.ToFloat32() * wsum; | ||||
|             } | ||||
| 
 | ||||
|             // Clamp the result
 | ||||
|             depth = std::clamp(depth, 0.0f, 1.0f); | ||||
| 
 | ||||
|             /**
 | ||||
|              * Perspective correct attribute interpolation: | ||||
|              * Attribute values cannot be calculated by simple linear interpolation since | ||||
|              * they are not linear in screen space. For example, when interpolating a | ||||
|              * texture coordinate across two vertices, something simple like | ||||
|              *     u = (u0*w0 + u1*w1)/(w0+w1) | ||||
|              * will not work. However, the attribute value divided by the | ||||
|              * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear | ||||
|              * in screenspace. Hence, we can linearly interpolate these two independently and | ||||
|              * calculate the interpolated attribute by dividing the results. | ||||
|              * I.e. | ||||
|              *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) | ||||
|              *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) | ||||
|              *     u = u_over_w / one_over_w | ||||
|              * | ||||
|              * The generalization to three vertices is straightforward in baricentric coordinates. | ||||
|              **/ | ||||
|             const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { | ||||
|                 auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); | ||||
|                 f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates); | ||||
|                 return interpolated_attr_over_w * interpolated_w_inverse; | ||||
|             }; | ||||
| 
 | ||||
|             const Common::Vec4<u8> primary_color{ | ||||
|                 static_cast<u8>( | ||||
|                     round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) | ||||
|                               .ToFloat32() * | ||||
|                           255)), | ||||
|                 static_cast<u8>( | ||||
|                     round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) | ||||
|                               .ToFloat32() * | ||||
|                           255)), | ||||
|                 static_cast<u8>( | ||||
|                     round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) | ||||
|                               .ToFloat32() * | ||||
|                           255)), | ||||
|                 static_cast<u8>( | ||||
|                     round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) | ||||
|                               .ToFloat32() * | ||||
|                           255)), | ||||
|             }; | ||||
| 
 | ||||
|             std::array<Common::Vec2<f24>, 3> uv; | ||||
|             uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); | ||||
|             uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); | ||||
|             uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); | ||||
|             uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); | ||||
|             uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); | ||||
|             uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); | ||||
| 
 | ||||
|             // Sample bound texture units.
 | ||||
|             const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); | ||||
|             const auto texture_color = TextureColor(uv, textures, tc0_w); | ||||
| 
 | ||||
|             Common::Vec4<u8> primary_fragment_color{0, 0, 0, 0}; | ||||
|             Common::Vec4<u8> secondary_fragment_color{0, 0, 0, 0}; | ||||
|             if (!regs.lighting.disable) { | ||||
|                 const auto normquat = | ||||
|                     Common::Quaternion<f32>{ | ||||
|                         {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), | ||||
|                          get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), | ||||
|                          get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, | ||||
|                         get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), | ||||
|                     } | ||||
|                         .Normalized(); | ||||
| 
 | ||||
|                 const Common::Vec3f view{ | ||||
|                     get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), | ||||
|                     get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), | ||||
|                     get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), | ||||
|                 }; | ||||
|                 std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( | ||||
|                     regs.lighting, state.lighting, normquat, view, texture_color); | ||||
|             } | ||||
| 
 | ||||
|             // Write the TEV stages.
 | ||||
|             Common::Vec4<u8> combiner_output = | ||||
|                 WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, | ||||
|                                secondary_fragment_color); | ||||
| 
 | ||||
|             const auto& output_merger = regs.framebuffer.output_merger; | ||||
|             if (output_merger.fragment_operation_mode == | ||||
|                 FramebufferRegs::FragmentOperationMode::Shadow) { | ||||
|                 u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); | ||||
|                 // Use green color as the shadow intensity
 | ||||
|                 u8 stencil = combiner_output.y; | ||||
|                 fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); | ||||
|                 // Skip the normal output merger pipeline if it is in shadow mode
 | ||||
|                 continue; | ||||
|             } | ||||
| 
 | ||||
|             // Does alpha testing happen before or after stencil?
 | ||||
|             if (!DoAlphaTest(combiner_output.a())) { | ||||
|                 continue; | ||||
|             } | ||||
|             WriteFog(combiner_output, depth); | ||||
|             if (!DoDepthStencilTest(x, y, depth, stencil_action_enable)) { | ||||
|                 continue; | ||||
|             } | ||||
|             const auto result = PixelColor(x, y, combiner_output); | ||||
|             if (regs.framebuffer.framebuffer.allow_color_write != 0) { | ||||
|                 fb.DrawPixel(x >> 4, y >> 4, result); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( | ||||
|     std::span<const Common::Vec2<f24>, 3> uv, | ||||
|     std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const { | ||||
|     std::array<Common::Vec4<u8>, 4> texture_color{}; | ||||
|     for (u32 i = 0; i < 3; ++i) { | ||||
|         const auto& texture = textures[i]; | ||||
|         if (!texture.enabled) [[unlikely]] { | ||||
|             continue; | ||||
|         } | ||||
|         if (texture.config.address == 0) [[unlikely]] { | ||||
|             texture_color[i] = {0, 0, 0, 255}; | ||||
|             continue; | ||||
|         } | ||||
| 
 | ||||
|         const s32 coordinate_i = (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 1 : i; | ||||
|         f24 u = uv[coordinate_i].u(); | ||||
|         f24 v = uv[coordinate_i].v(); | ||||
| 
 | ||||
|         // Only unit 0 respects the texturing type (according to 3DBrew)
 | ||||
|         PAddr texture_address = texture.config.GetPhysicalAddress(); | ||||
|         f24 shadow_z; | ||||
|         if (i == 0) { | ||||
|             switch (texture.config.type) { | ||||
|             case TexturingRegs::TextureConfig::Texture2D: | ||||
|                 break; | ||||
|             case TexturingRegs::TextureConfig::ShadowCube: | ||||
|             case TexturingRegs::TextureConfig::TextureCube: { | ||||
|                 std::tie(u, v, shadow_z, texture_address) = | ||||
|                     ConvertCubeCoord(u, v, tc0_w, regs.texturing); | ||||
|                 break; | ||||
|             } | ||||
|             case TexturingRegs::TextureConfig::Projection2D: { | ||||
|                 u /= tc0_w; | ||||
|                 v /= tc0_w; | ||||
|                 break; | ||||
|             } | ||||
|             case TexturingRegs::TextureConfig::Shadow2D: { | ||||
|                 if (!regs.texturing.shadow.orthographic) { | ||||
|                     u /= tc0_w; | ||||
|                     v /= tc0_w; | ||||
|                 } | ||||
|                 shadow_z = f24::FromFloat32(std::abs(tc0_w.ToFloat32())); | ||||
|                 break; | ||||
|             } | ||||
|             case TexturingRegs::TextureConfig::Disabled: | ||||
|                 continue; // skip this unit and continue to the next unit
 | ||||
|             default: | ||||
|                 LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type); | ||||
|                 UNIMPLEMENTED(); | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         const f24 width = f24::FromFloat32(static_cast<f32>(texture.config.width)); | ||||
|         const f24 height = f24::FromFloat32(static_cast<f32>(texture.config.height)); | ||||
|         s32 s = static_cast<s32>((u * width).ToFloat32()); | ||||
|         s32 t = static_cast<s32>((v * height).ToFloat32()); | ||||
| 
 | ||||
|         bool use_border_s = false; | ||||
|         bool use_border_t = false; | ||||
| 
 | ||||
|         if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) { | ||||
|             use_border_s = s < 0 || s >= static_cast<s32>(texture.config.width); | ||||
|         } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) { | ||||
|             use_border_s = s >= static_cast<s32>(texture.config.width); | ||||
|         } | ||||
| 
 | ||||
|         if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) { | ||||
|             use_border_t = t < 0 || t >= static_cast<s32>(texture.config.height); | ||||
|         } else if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder2) { | ||||
|             use_border_t = t >= static_cast<s32>(texture.config.height); | ||||
|         } | ||||
| 
 | ||||
|         if (use_border_s || use_border_t) { | ||||
|             const auto border_color = texture.config.border_color; | ||||
|             texture_color[i] = Common::MakeVec(border_color.r.Value(), border_color.g.Value(), | ||||
|                                                border_color.b.Value(), border_color.a.Value()) | ||||
|                                    .Cast<u8>(); | ||||
|         } else { | ||||
|             // Textures are laid out from bottom to top, hence we invert the t coordinate.
 | ||||
|             // NOTE: This may not be the right place for the inversion.
 | ||||
|             // TODO: Check if this applies to ETC textures, too.
 | ||||
|             s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); | ||||
|             t = texture.config.height - 1 - | ||||
|                 GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); | ||||
| 
 | ||||
|             const u8* texture_data = memory.GetPhysicalPointer(texture_address); | ||||
|             const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format); | ||||
| 
 | ||||
|             // TODO: Apply the min and mag filters to the texture
 | ||||
|             texture_color[i] = LookupTexture(texture_data, s, t, info); | ||||
|         } | ||||
| 
 | ||||
|         if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || | ||||
|                        texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) { | ||||
| 
 | ||||
|             s32 z_int = static_cast<s32>(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF); | ||||
|             z_int -= regs.texturing.shadow.bias << 1; | ||||
|             const auto& color = texture_color[i]; | ||||
|             const s32 z_ref = (color.w << 16) | (color.z << 8) | color.y; | ||||
|             u8 density; | ||||
|             if (z_ref >= z_int) { | ||||
|                 density = color.x; | ||||
|             } else { | ||||
|                 density = 0; | ||||
|             } | ||||
|             texture_color[i] = {density, density, density, density}; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // Sample procedural texture
 | ||||
|     if (regs.texturing.main_config.texture3_enable) { | ||||
|         const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates]; | ||||
|         texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(), | ||||
|                                    regs.texturing, state.proctex); | ||||
|     } | ||||
| 
 | ||||
|     return texture_color; | ||||
| } | ||||
| 
 | ||||
| Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y, | ||||
|                                                 Common::Vec4<u8>& combiner_output) const { | ||||
|     const auto dest = fb.GetPixel(x >> 4, y >> 4); | ||||
|     Common::Vec4<u8> blend_output = combiner_output; | ||||
| 
 | ||||
|     const auto& output_merger = regs.framebuffer.output_merger; | ||||
|     if (output_merger.alphablend_enable) { | ||||
|         const auto params = output_merger.alpha_blending; | ||||
|         const auto lookup_factor = [&](u32 channel, FramebufferRegs::BlendFactor factor) -> u8 { | ||||
|             DEBUG_ASSERT(channel < 4); | ||||
| 
 | ||||
|             const Common::Vec4<u8> blend_const = | ||||
|                 Common::MakeVec( | ||||
|                     output_merger.blend_const.r.Value(), output_merger.blend_const.g.Value(), | ||||
|                     output_merger.blend_const.b.Value(), output_merger.blend_const.a.Value()) | ||||
|                     .Cast<u8>(); | ||||
| 
 | ||||
|             switch (factor) { | ||||
|             case FramebufferRegs::BlendFactor::Zero: | ||||
|                 return 0; | ||||
|             case FramebufferRegs::BlendFactor::One: | ||||
|                 return 255; | ||||
|             case FramebufferRegs::BlendFactor::SourceColor: | ||||
|                 return combiner_output[channel]; | ||||
|             case FramebufferRegs::BlendFactor::OneMinusSourceColor: | ||||
|                 return 255 - combiner_output[channel]; | ||||
|             case FramebufferRegs::BlendFactor::DestColor: | ||||
|                 return dest[channel]; | ||||
|             case FramebufferRegs::BlendFactor::OneMinusDestColor: | ||||
|                 return 255 - dest[channel]; | ||||
|             case FramebufferRegs::BlendFactor::SourceAlpha: | ||||
|                 return combiner_output.a(); | ||||
|             case FramebufferRegs::BlendFactor::OneMinusSourceAlpha: | ||||
|                 return 255 - combiner_output.a(); | ||||
|             case FramebufferRegs::BlendFactor::DestAlpha: | ||||
|                 return dest.a(); | ||||
|             case FramebufferRegs::BlendFactor::OneMinusDestAlpha: | ||||
|                 return 255 - dest.a(); | ||||
|             case FramebufferRegs::BlendFactor::ConstantColor: | ||||
|                 return blend_const[channel]; | ||||
|             case FramebufferRegs::BlendFactor::OneMinusConstantColor: | ||||
|                 return 255 - blend_const[channel]; | ||||
|             case FramebufferRegs::BlendFactor::ConstantAlpha: | ||||
|                 return blend_const.a(); | ||||
|             case FramebufferRegs::BlendFactor::OneMinusConstantAlpha: | ||||
|                 return 255 - blend_const.a(); | ||||
|             case FramebufferRegs::BlendFactor::SourceAlphaSaturate: | ||||
|                 // Returns 1.0 for the alpha channel
 | ||||
|                 if (channel == 3) { | ||||
|                     return 255; | ||||
|                 } | ||||
|                 return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a())); | ||||
|             default: | ||||
|                 LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor); | ||||
|                 UNIMPLEMENTED(); | ||||
|                 break; | ||||
|             } | ||||
|             return combiner_output[channel]; | ||||
|         }; | ||||
| 
 | ||||
|         const auto srcfactor = Common::MakeVec( | ||||
|             lookup_factor(0, params.factor_source_rgb), lookup_factor(1, params.factor_source_rgb), | ||||
|             lookup_factor(2, params.factor_source_rgb), lookup_factor(3, params.factor_source_a)); | ||||
| 
 | ||||
|         const auto dstfactor = Common::MakeVec( | ||||
|             lookup_factor(0, params.factor_dest_rgb), lookup_factor(1, params.factor_dest_rgb), | ||||
|             lookup_factor(2, params.factor_dest_rgb), lookup_factor(3, params.factor_dest_a)); | ||||
| 
 | ||||
|         blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, | ||||
|                                              params.blend_equation_rgb); | ||||
|         blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, | ||||
|                                                  params.blend_equation_a) | ||||
|                                .a(); | ||||
|     } else { | ||||
|         blend_output = | ||||
|             Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op), | ||||
|                             LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op), | ||||
|                             LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op), | ||||
|                             LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op)); | ||||
|     } | ||||
| 
 | ||||
|     const Common::Vec4<u8> result = { | ||||
|         output_merger.red_enable ? blend_output.r() : dest.r(), | ||||
|         output_merger.green_enable ? blend_output.g() : dest.g(), | ||||
|         output_merger.blue_enable ? blend_output.b() : dest.b(), | ||||
|         output_merger.alpha_enable ? blend_output.a() : dest.a(), | ||||
|     }; | ||||
| 
 | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| Common::Vec4<u8> RasterizerSoftware::WriteTevConfig( | ||||
|     std::span<const Common::Vec4<u8>, 4> texture_color, | ||||
|     std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, | ||||
|     Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, | ||||
|     Common::Vec4<u8> secondary_fragment_color) const { | ||||
|     /**
 | ||||
|      * Texture environment - consists of 6 stages of color and alpha combining. | ||||
|      * Color combiners take three input color values from some source (e.g. interpolated | ||||
|      * vertex color, texture color, previous stage, etc), perform some very simple | ||||
|      * operations on each of them (e.g. inversion) and then calculate the output color | ||||
|      * with some basic arithmetic. Alpha combiners can be configured separately but work | ||||
|      * analogously. | ||||
|      **/ | ||||
|     Common::Vec4<u8> combiner_output; | ||||
|     Common::Vec4<u8> combiner_buffer = {0, 0, 0, 0}; | ||||
|     Common::Vec4<u8> next_combiner_buffer = | ||||
|         Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(), | ||||
|                         regs.texturing.tev_combiner_buffer_color.g.Value(), | ||||
|                         regs.texturing.tev_combiner_buffer_color.b.Value(), | ||||
|                         regs.texturing.tev_combiner_buffer_color.a.Value()) | ||||
|             .Cast<u8>(); | ||||
| 
 | ||||
|     for (u32 tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) { | ||||
|         const auto& tev_stage = tev_stages[tev_stage_index]; | ||||
|         using Source = TexturingRegs::TevStageConfig::Source; | ||||
| 
 | ||||
|         auto get_source = [&](Source source) -> Common::Vec4<u8> { | ||||
|             switch (source) { | ||||
|             case Source::PrimaryColor: | ||||
|                 return primary_color; | ||||
|             case Source::PrimaryFragmentColor: | ||||
|                 return primary_fragment_color; | ||||
|             case Source::SecondaryFragmentColor: | ||||
|                 return secondary_fragment_color; | ||||
|             case Source::Texture0: | ||||
|                 return texture_color[0]; | ||||
|             case Source::Texture1: | ||||
|                 return texture_color[1]; | ||||
|             case Source::Texture2: | ||||
|                 return texture_color[2]; | ||||
|             case Source::Texture3: | ||||
|                 return texture_color[3]; | ||||
|             case Source::PreviousBuffer: | ||||
|                 return combiner_buffer; | ||||
|             case Source::Constant: | ||||
|                 return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(), | ||||
|                                        tev_stage.const_b.Value(), tev_stage.const_a.Value()) | ||||
|                     .Cast<u8>(); | ||||
|             case Source::Previous: | ||||
|                 return combiner_output; | ||||
|             default: | ||||
|                 LOG_ERROR(HW_GPU, "Unknown color combiner source {}", (int)source); | ||||
|                 UNIMPLEMENTED(); | ||||
|                 return {0, 0, 0, 0}; | ||||
|             } | ||||
|         }; | ||||
| 
 | ||||
|         /**
 | ||||
|          * Color combiner | ||||
|          * NOTE: Not sure if the alpha combiner might use the color output of the previous | ||||
|          *       stage as input. Hence, we currently don't directly write the result to | ||||
|          *       combiner_output.rgb(), but instead store it in a temporary variable until | ||||
|          *       alpha combining has been done. | ||||
|          **/ | ||||
|         const std::array<Common::Vec3<u8>, 3> color_result = { | ||||
|             GetColorModifier(tev_stage.color_modifier1, get_source(tev_stage.color_source1)), | ||||
|             GetColorModifier(tev_stage.color_modifier2, get_source(tev_stage.color_source2)), | ||||
|             GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)), | ||||
|         }; | ||||
|         const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result); | ||||
| 
 | ||||
|         u8 alpha_output; | ||||
|         if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) { | ||||
|             // result of Dot3_RGBA operation is also placed to the alpha component
 | ||||
|             alpha_output = color_output.x; | ||||
|         } else { | ||||
|             // alpha combiner
 | ||||
|             const std::array<u8, 3> alpha_result = {{ | ||||
|                 GetAlphaModifier(tev_stage.alpha_modifier1, get_source(tev_stage.alpha_source1)), | ||||
|                 GetAlphaModifier(tev_stage.alpha_modifier2, get_source(tev_stage.alpha_source2)), | ||||
|                 GetAlphaModifier(tev_stage.alpha_modifier3, get_source(tev_stage.alpha_source3)), | ||||
|             }}; | ||||
|             alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result); | ||||
|         } | ||||
| 
 | ||||
|         combiner_output[0] = std::min(255U, color_output.r() * tev_stage.GetColorMultiplier()); | ||||
|         combiner_output[1] = std::min(255U, color_output.g() * tev_stage.GetColorMultiplier()); | ||||
|         combiner_output[2] = std::min(255U, color_output.b() * tev_stage.GetColorMultiplier()); | ||||
|         combiner_output[3] = std::min(255U, alpha_output * tev_stage.GetAlphaMultiplier()); | ||||
| 
 | ||||
|         combiner_buffer = next_combiner_buffer; | ||||
| 
 | ||||
|         if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor( | ||||
|                 tev_stage_index)) { | ||||
|             next_combiner_buffer.r() = combiner_output.r(); | ||||
|             next_combiner_buffer.g() = combiner_output.g(); | ||||
|             next_combiner_buffer.b() = combiner_output.b(); | ||||
|         } | ||||
| 
 | ||||
|         if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha( | ||||
|                 tev_stage_index)) { | ||||
|             next_combiner_buffer.a() = combiner_output.a(); | ||||
|         } | ||||
|     } | ||||
|     return combiner_output; | ||||
| } | ||||
| 
 | ||||
| void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const { | ||||
|     /**
 | ||||
|      * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to | ||||
|      * store the depth etc. Using float for now until we know more about Pica datatypes. | ||||
|      **/ | ||||
|     if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) { | ||||
|         const Common::Vec3<u8> fog_color = | ||||
|             Common::MakeVec(regs.texturing.fog_color.r.Value(), regs.texturing.fog_color.g.Value(), | ||||
|                             regs.texturing.fog_color.b.Value()) | ||||
|                 .Cast<u8>(); | ||||
| 
 | ||||
|         float fog_index; | ||||
|         if (regs.texturing.fog_flip) { | ||||
|             fog_index = (1.0f - depth) * 128.0f; | ||||
|         } else { | ||||
|             fog_index = depth * 128.0f; | ||||
|         } | ||||
| 
 | ||||
|         // Generate clamped fog factor from LUT for given fog index
 | ||||
|         const f32 fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f); | ||||
|         const f32 fog_f = fog_index - fog_i; | ||||
|         const auto& fog_lut_entry = state.fog.lut[static_cast<u32>(fog_i)]; | ||||
|         f32 fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f; | ||||
|         fog_factor = std::clamp(fog_factor, 0.0f, 1.0f); | ||||
|         for (u32 i = 0; i < 3; i++) { | ||||
|             combiner_output[i] = static_cast<u8>(fog_factor * combiner_output[i] + | ||||
|                                                  (1.0f - fog_factor) * fog_color[i]); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| bool RasterizerSoftware::DoAlphaTest(u8 alpha) const { | ||||
|     const auto& output_merger = regs.framebuffer.output_merger; | ||||
|     if (!output_merger.alpha_test.enable) { | ||||
|         return true; | ||||
|     } | ||||
|     switch (output_merger.alpha_test.func) { | ||||
|     case FramebufferRegs::CompareFunc::Never: | ||||
|         return false; | ||||
|     case FramebufferRegs::CompareFunc::Always: | ||||
|         return true; | ||||
|     case FramebufferRegs::CompareFunc::Equal: | ||||
|         return alpha == output_merger.alpha_test.ref; | ||||
|     case FramebufferRegs::CompareFunc::NotEqual: | ||||
|         return alpha != output_merger.alpha_test.ref; | ||||
|     case FramebufferRegs::CompareFunc::LessThan: | ||||
|         return alpha < output_merger.alpha_test.ref; | ||||
|     case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|         return alpha <= output_merger.alpha_test.ref; | ||||
|     case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|         return alpha > output_merger.alpha_test.ref; | ||||
|     case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|         return alpha >= output_merger.alpha_test.ref; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| bool RasterizerSoftware::DoDepthStencilTest(u16 x, u16 y, float depth, | ||||
|                                             bool stencil_action_enable) const { | ||||
|     const auto& framebuffer = regs.framebuffer.framebuffer; | ||||
|     const auto stencil_test = regs.framebuffer.output_merger.stencil_test; | ||||
|     u8 old_stencil = 0; | ||||
| 
 | ||||
|     const auto update_stencil = [&](Pica::FramebufferRegs::StencilAction action) { | ||||
|         const u8 new_stencil = | ||||
|             PerformStencilAction(action, old_stencil, stencil_test.reference_value); | ||||
|         if (framebuffer.allow_depth_stencil_write != 0) { | ||||
|             const u8 stencil = | ||||
|                 (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask); | ||||
|             fb.SetStencil(x >> 4, y >> 4, stencil); | ||||
|         } | ||||
|     }; | ||||
| 
 | ||||
|     if (stencil_action_enable) { | ||||
|         old_stencil = fb.GetStencil(x >> 4, y >> 4); | ||||
|         const u8 dest = old_stencil & stencil_test.input_mask; | ||||
|         const u8 ref = stencil_test.reference_value & stencil_test.input_mask; | ||||
|         bool pass = false; | ||||
|         switch (stencil_test.func) { | ||||
|         case FramebufferRegs::CompareFunc::Never: | ||||
|             pass = false; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::Always: | ||||
|             pass = true; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::Equal: | ||||
|             pass = (ref == dest); | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::NotEqual: | ||||
|             pass = (ref != dest); | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::LessThan: | ||||
|             pass = (ref < dest); | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|             pass = (ref <= dest); | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|             pass = (ref > dest); | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|             pass = (ref >= dest); | ||||
|             break; | ||||
|         } | ||||
|         if (!pass) { | ||||
|             update_stencil(stencil_test.action_stencil_fail); | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     const u32 num_bits = FramebufferRegs::DepthBitsPerPixel(framebuffer.depth_format); | ||||
|     const u32 z = static_cast<u32>(depth * ((1 << num_bits) - 1)); | ||||
| 
 | ||||
|     const auto& output_merger = regs.framebuffer.output_merger; | ||||
|     if (output_merger.depth_test_enable) { | ||||
|         const u32 ref_z = fb.GetDepth(x >> 4, y >> 4); | ||||
|         bool pass = false; | ||||
|         switch (output_merger.depth_test_func) { | ||||
|         case FramebufferRegs::CompareFunc::Never: | ||||
|             pass = false; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::Always: | ||||
|             pass = true; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::Equal: | ||||
|             pass = z == ref_z; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::NotEqual: | ||||
|             pass = z != ref_z; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::LessThan: | ||||
|             pass = z < ref_z; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::LessThanOrEqual: | ||||
|             pass = z <= ref_z; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::GreaterThan: | ||||
|             pass = z > ref_z; | ||||
|             break; | ||||
|         case FramebufferRegs::CompareFunc::GreaterThanOrEqual: | ||||
|             pass = z >= ref_z; | ||||
|             break; | ||||
|         } | ||||
|         if (!pass) { | ||||
|             if (stencil_action_enable) { | ||||
|                 update_stencil(stencil_test.action_depth_fail); | ||||
|             } | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
|     if (framebuffer.allow_depth_stencil_write != 0 && output_merger.depth_write_enable) { | ||||
|         fb.SetDepth(x >> 4, y >> 4, z); | ||||
|     } | ||||
|     // The stencil depth_pass action is executed even if depth testing is disabled
 | ||||
|     if (stencil_action_enable) { | ||||
|         update_stencil(stencil_test.action_depth_pass); | ||||
|     } | ||||
| 
 | ||||
|     return true; | ||||
| } | ||||
| 
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -4,16 +4,30 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include "common/common_types.h" | ||||
| #include <span> | ||||
| 
 | ||||
| #include "video_core/rasterizer_interface.h" | ||||
| #include "video_core/regs_texturing.h" | ||||
| #include "video_core/renderer_software/sw_clipper.h" | ||||
| #include "video_core/renderer_software/sw_framebuffer.h" | ||||
| 
 | ||||
| namespace Pica::Shader { | ||||
| struct OutputVertex; | ||||
| } // namespace Pica::Shader
 | ||||
| } | ||||
| 
 | ||||
| namespace VideoCore { | ||||
| namespace Pica { | ||||
| struct State; | ||||
| struct Regs; | ||||
| } // namespace Pica
 | ||||
| 
 | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| struct Vertex; | ||||
| 
 | ||||
| class RasterizerSoftware : public VideoCore::RasterizerInterface { | ||||
| public: | ||||
|     explicit RasterizerSoftware(Memory::MemorySystem& memory); | ||||
| 
 | ||||
| class RasterizerSoftware : public RasterizerInterface { | ||||
|     void AddTriangle(const Pica::Shader::OutputVertex& v0, const Pica::Shader::OutputVertex& v1, | ||||
|                      const Pica::Shader::OutputVertex& v2) override; | ||||
|     void DrawTriangles() override {} | ||||
|  | @ -23,6 +37,44 @@ class RasterizerSoftware : public RasterizerInterface { | |||
|     void InvalidateRegion(PAddr addr, u32 size) override {} | ||||
|     void FlushAndInvalidateRegion(PAddr addr, u32 size) override {} | ||||
|     void ClearAll(bool flush) override {} | ||||
| 
 | ||||
| private: | ||||
|     /// Computes the screen coordinates of the provided vertex.
 | ||||
|     void MakeScreenCoords(Vertex& vtx); | ||||
| 
 | ||||
|     /// Processes the triangle defined by the provided vertices.
 | ||||
|     void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2, | ||||
|                          bool reversed = false); | ||||
| 
 | ||||
|     /// Returns the texture color of the currently processed pixel.
 | ||||
|     std::array<Common::Vec4<u8>, 4> TextureColor( | ||||
|         std::span<const Common::Vec2<f24>, 3> uv, | ||||
|         std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const; | ||||
| 
 | ||||
|     /// Returns the final pixel color with blending or logic ops applied.
 | ||||
|     Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const; | ||||
| 
 | ||||
|     /// Emulates the TEV configuration and returns the combiner output.
 | ||||
|     Common::Vec4<u8> WriteTevConfig( | ||||
|         std::span<const Common::Vec4<u8>, 4> texture_color, | ||||
|         std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages, | ||||
|         Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color, | ||||
|         Common::Vec4<u8> secondary_fragment_color) const; | ||||
| 
 | ||||
|     /// Blends fog to the combiner output if enabled.
 | ||||
|     void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const; | ||||
| 
 | ||||
|     /// Performs the alpha test. Returns false if the test failed.
 | ||||
|     bool DoAlphaTest(u8 alpha) const; | ||||
| 
 | ||||
|     /// Performs the depth stencil test. Returns false if the test failed.
 | ||||
|     bool DoDepthStencilTest(u16 x, u16 y, float depth, bool stencil_action_enable) const; | ||||
| 
 | ||||
| private: | ||||
|     Memory::MemorySystem& memory; | ||||
|     Pica::State& state; | ||||
|     const Pica::Regs& regs; | ||||
|     Framebuffer fb; | ||||
| }; | ||||
| 
 | ||||
| } // namespace VideoCore
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -9,41 +9,40 @@ | |||
| #include "video_core/regs_texturing.h" | ||||
| #include "video_core/renderer_software/sw_texturing.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| using TevStageConfig = TexturingRegs::TevStageConfig; | ||||
| using TevStageConfig = Pica::TexturingRegs::TevStageConfig; | ||||
| 
 | ||||
| int GetWrappedTexCoord(Pica::TexturingRegs::TextureConfig::WrapMode mode, s32 val, u32 size) { | ||||
|     using TextureConfig = Pica::TexturingRegs::TextureConfig; | ||||
| 
 | ||||
| int GetWrappedTexCoord(TexturingRegs::TextureConfig::WrapMode mode, int val, unsigned size) { | ||||
|     switch (mode) { | ||||
|     case TexturingRegs::TextureConfig::ClampToEdge2: | ||||
|     case TextureConfig::ClampToEdge2: | ||||
|         // For negative coordinate, ClampToEdge2 behaves the same as Repeat
 | ||||
|         if (val < 0) { | ||||
|             return static_cast<int>(static_cast<unsigned>(val) % size); | ||||
|             return static_cast<s32>(static_cast<u32>(val) % size); | ||||
|         } | ||||
|     // [[fallthrough]]
 | ||||
|     case TexturingRegs::TextureConfig::ClampToEdge: | ||||
|         [[fallthrough]]; | ||||
|     case TextureConfig::ClampToEdge: | ||||
|         val = std::max(val, 0); | ||||
|         val = std::min(val, static_cast<int>(size) - 1); | ||||
|         val = std::min(val, static_cast<s32>(size) - 1); | ||||
|         return val; | ||||
| 
 | ||||
|     case TexturingRegs::TextureConfig::ClampToBorder: | ||||
|     case TextureConfig::ClampToBorder: | ||||
|         return val; | ||||
| 
 | ||||
|     case TexturingRegs::TextureConfig::ClampToBorder2: | ||||
|     case TextureConfig::ClampToBorder2: | ||||
|     // For ClampToBorder2, the case of positive coordinate beyond the texture size is already
 | ||||
|     // handled outside. Here we only handle the negative coordinate in the same way as Repeat.
 | ||||
|     case TexturingRegs::TextureConfig::Repeat2: | ||||
|     case TexturingRegs::TextureConfig::Repeat3: | ||||
|     case TexturingRegs::TextureConfig::Repeat: | ||||
|         return static_cast<int>(static_cast<unsigned>(val) % size); | ||||
| 
 | ||||
|     case TexturingRegs::TextureConfig::MirroredRepeat: { | ||||
|         unsigned int coord = (static_cast<unsigned>(val) % (2 * size)); | ||||
|         if (coord >= size) | ||||
|     case TextureConfig::Repeat2: | ||||
|     case TextureConfig::Repeat3: | ||||
|     case TextureConfig::Repeat: | ||||
|         return static_cast<s32>(static_cast<u32>(val) % size); | ||||
|     case TextureConfig::MirroredRepeat: { | ||||
|         u32 coord = (static_cast<u32>(val) % (2 * size)); | ||||
|         if (coord >= size) { | ||||
|             coord = 2 * size - 1 - coord; | ||||
|         return static_cast<int>(coord); | ||||
|         } | ||||
|         return static_cast<s32>(coord); | ||||
|     } | ||||
| 
 | ||||
|     default: | ||||
|         LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode {:x}", (int)mode); | ||||
|         UNIMPLEMENTED(); | ||||
|  | @ -58,35 +57,25 @@ Common::Vec3<u8> GetColorModifier(TevStageConfig::ColorModifier factor, | |||
|     switch (factor) { | ||||
|     case ColorModifier::SourceColor: | ||||
|         return values.rgb(); | ||||
| 
 | ||||
|     case ColorModifier::OneMinusSourceColor: | ||||
|         return (Common::Vec3<u8>(255, 255, 255) - values.rgb()).Cast<u8>(); | ||||
| 
 | ||||
|     case ColorModifier::SourceAlpha: | ||||
|         return values.aaa(); | ||||
| 
 | ||||
|     case ColorModifier::OneMinusSourceAlpha: | ||||
|         return (Common::Vec3<u8>(255, 255, 255) - values.aaa()).Cast<u8>(); | ||||
| 
 | ||||
|     case ColorModifier::SourceRed: | ||||
|         return values.rrr(); | ||||
| 
 | ||||
|     case ColorModifier::OneMinusSourceRed: | ||||
|         return (Common::Vec3<u8>(255, 255, 255) - values.rrr()).Cast<u8>(); | ||||
| 
 | ||||
|     case ColorModifier::SourceGreen: | ||||
|         return values.ggg(); | ||||
| 
 | ||||
|     case ColorModifier::OneMinusSourceGreen: | ||||
|         return (Common::Vec3<u8>(255, 255, 255) - values.ggg()).Cast<u8>(); | ||||
| 
 | ||||
|     case ColorModifier::SourceBlue: | ||||
|         return values.bbb(); | ||||
| 
 | ||||
|     case ColorModifier::OneMinusSourceBlue: | ||||
|         return (Common::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>(); | ||||
|     } | ||||
| 
 | ||||
|     UNREACHABLE(); | ||||
| }; | ||||
| 
 | ||||
|  | @ -96,42 +85,33 @@ u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Common::Vec4<u8> | |||
|     switch (factor) { | ||||
|     case AlphaModifier::SourceAlpha: | ||||
|         return values.a(); | ||||
| 
 | ||||
|     case AlphaModifier::OneMinusSourceAlpha: | ||||
|         return 255 - values.a(); | ||||
| 
 | ||||
|     case AlphaModifier::SourceRed: | ||||
|         return values.r(); | ||||
| 
 | ||||
|     case AlphaModifier::OneMinusSourceRed: | ||||
|         return 255 - values.r(); | ||||
| 
 | ||||
|     case AlphaModifier::SourceGreen: | ||||
|         return values.g(); | ||||
| 
 | ||||
|     case AlphaModifier::OneMinusSourceGreen: | ||||
|         return 255 - values.g(); | ||||
| 
 | ||||
|     case AlphaModifier::SourceBlue: | ||||
|         return values.b(); | ||||
| 
 | ||||
|     case AlphaModifier::OneMinusSourceBlue: | ||||
|         return 255 - values.b(); | ||||
|     } | ||||
| 
 | ||||
|     UNREACHABLE(); | ||||
| }; | ||||
| 
 | ||||
| Common::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Common::Vec3<u8> input[3]) { | ||||
| Common::Vec3<u8> ColorCombine(TevStageConfig::Operation op, | ||||
|                               std::span<const Common::Vec3<u8>, 3> input) { | ||||
|     using Operation = TevStageConfig::Operation; | ||||
| 
 | ||||
|     switch (op) { | ||||
|     case Operation::Replace: | ||||
|         return input[0]; | ||||
| 
 | ||||
|     case Operation::Modulate: | ||||
|         return ((input[0] * input[1]) / 255).Cast<u8>(); | ||||
| 
 | ||||
|     case Operation::Add: { | ||||
|         auto result = input[0] + input[1]; | ||||
|         result.r() = std::min(255, result.r()); | ||||
|  | @ -139,46 +119,41 @@ Common::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Common::Vec3<u | |||
|         result.b() = std::min(255, result.b()); | ||||
|         return result.Cast<u8>(); | ||||
|     } | ||||
| 
 | ||||
|     case Operation::AddSigned: { | ||||
|         // TODO(bunnei): Verify that the color conversion from (float) 0.5f to
 | ||||
|         // (byte) 128 is correct
 | ||||
|         auto result = | ||||
|             input[0].Cast<int>() + input[1].Cast<int>() - Common::MakeVec<int>(128, 128, 128); | ||||
|         result.r() = std::clamp<int>(result.r(), 0, 255); | ||||
|         result.g() = std::clamp<int>(result.g(), 0, 255); | ||||
|         result.b() = std::clamp<int>(result.b(), 0, 255); | ||||
|         Common::Vec3i result = | ||||
|             input[0].Cast<s32>() + input[1].Cast<s32>() - Common::MakeVec<s32>(128, 128, 128); | ||||
|         result.r() = std::clamp<s32>(result.r(), 0, 255); | ||||
|         result.g() = std::clamp<s32>(result.g(), 0, 255); | ||||
|         result.b() = std::clamp<s32>(result.b(), 0, 255); | ||||
|         return result.Cast<u8>(); | ||||
|     } | ||||
| 
 | ||||
|     case Operation::Lerp: | ||||
|         return ((input[0] * input[2] + | ||||
|                  input[1] * (Common::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / | ||||
|                 255) | ||||
|             .Cast<u8>(); | ||||
| 
 | ||||
|     case Operation::Subtract: { | ||||
|         auto result = input[0].Cast<int>() - input[1].Cast<int>(); | ||||
|         auto result = input[0].Cast<s32>() - input[1].Cast<s32>(); | ||||
|         result.r() = std::max(0, result.r()); | ||||
|         result.g() = std::max(0, result.g()); | ||||
|         result.b() = std::max(0, result.b()); | ||||
|         return result.Cast<u8>(); | ||||
|     } | ||||
| 
 | ||||
|     case Operation::MultiplyThenAdd: { | ||||
|         auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255; | ||||
|         auto result = (input[0] * input[1] + 255 * input[2].Cast<s32>()) / 255; | ||||
|         result.r() = std::min(255, result.r()); | ||||
|         result.g() = std::min(255, result.g()); | ||||
|         result.b() = std::min(255, result.b()); | ||||
|         return result.Cast<u8>(); | ||||
|     } | ||||
| 
 | ||||
|     case Operation::AddThenMultiply: { | ||||
|         auto result = input[0] + input[1]; | ||||
|         result.r() = std::min(255, result.r()); | ||||
|         result.g() = std::min(255, result.g()); | ||||
|         result.b() = std::min(255, result.b()); | ||||
|         result = (result * input[2].Cast<int>()) / 255; | ||||
|         result = (result * input[2].Cast<s32>()) / 255; | ||||
|         return result.Cast<u8>(); | ||||
|     } | ||||
|     case Operation::Dot3_RGB: | ||||
|  | @ -187,11 +162,11 @@ Common::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Common::Vec3<u | |||
|         // indicate that the per-component computation can't have a higher precision than 1/256,
 | ||||
|         // while dot3_rgb((0x80,g0,b0), (0x7F,g1,b1)) and dot3_rgb((0x80,g0,b0), (0x80,g1,b1)) give
 | ||||
|         // different results.
 | ||||
|         int result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 + | ||||
|         s32 result = ((input[0].r() * 2 - 255) * (input[1].r() * 2 - 255) + 128) / 256 + | ||||
|                      ((input[0].g() * 2 - 255) * (input[1].g() * 2 - 255) + 128) / 256 + | ||||
|                      ((input[0].b() * 2 - 255) * (input[1].b() * 2 - 255) + 128) / 256; | ||||
|         result = std::max(0, std::min(255, result)); | ||||
|         return {(u8)result, (u8)result, (u8)result}; | ||||
|         result = std::clamp(result, 0, 255); | ||||
|         return Common::Vec3{result, result, result}.Cast<u8>(); | ||||
|     } | ||||
|     default: | ||||
|         LOG_ERROR(HW_GPU, "Unknown color combiner operation {}", (int)op); | ||||
|  | @ -205,31 +180,23 @@ u8 AlphaCombine(TevStageConfig::Operation op, const std::array<u8, 3>& input) { | |||
|         using Operation = TevStageConfig::Operation; | ||||
|     case Operation::Replace: | ||||
|         return input[0]; | ||||
| 
 | ||||
|     case Operation::Modulate: | ||||
|         return input[0] * input[1] / 255; | ||||
| 
 | ||||
|     case Operation::Add: | ||||
|         return std::min(255, input[0] + input[1]); | ||||
| 
 | ||||
|     case Operation::AddSigned: { | ||||
|         // TODO(bunnei): Verify that the color conversion from (float) 0.5f to (byte) 128 is correct
 | ||||
|         auto result = static_cast<int>(input[0]) + static_cast<int>(input[1]) - 128; | ||||
|         return static_cast<u8>(std::clamp<int>(result, 0, 255)); | ||||
|         auto result = static_cast<s32>(input[0]) + static_cast<s32>(input[1]) - 128; | ||||
|         return static_cast<u8>(std::clamp<s32>(result, 0, 255)); | ||||
|     } | ||||
| 
 | ||||
|     case Operation::Lerp: | ||||
|         return (input[0] * input[2] + input[1] * (255 - input[2])) / 255; | ||||
| 
 | ||||
|     case Operation::Subtract: | ||||
|         return std::max(0, (int)input[0] - (int)input[1]); | ||||
| 
 | ||||
|         return std::max(0, static_cast<s32>(input[0]) - static_cast<s32>(input[1])); | ||||
|     case Operation::MultiplyThenAdd: | ||||
|         return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255); | ||||
| 
 | ||||
|     case Operation::AddThenMultiply: | ||||
|         return (std::min(255, (input[0] + input[1])) * input[2]) / 255; | ||||
| 
 | ||||
|     default: | ||||
|         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation {}", (int)op); | ||||
|         UNIMPLEMENTED(); | ||||
|  | @ -237,4 +204,4 @@ u8 AlphaCombine(TevStageConfig::Operation op, const std::array<u8, 3>& input) { | |||
|     } | ||||
| }; | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -4,23 +4,25 @@ | |||
| 
 | ||||
| #pragma once | ||||
| 
 | ||||
| #include <span> | ||||
| 
 | ||||
| #include "common/common_types.h" | ||||
| #include "common/vector_math.h" | ||||
| #include "video_core/regs_texturing.h" | ||||
| 
 | ||||
| namespace Pica::Rasterizer { | ||||
| namespace SwRenderer { | ||||
| 
 | ||||
| int GetWrappedTexCoord(TexturingRegs::TextureConfig::WrapMode mode, int val, unsigned size); | ||||
| int GetWrappedTexCoord(Pica::TexturingRegs::TextureConfig::WrapMode mode, s32 val, u32 size); | ||||
| 
 | ||||
| Common::Vec3<u8> GetColorModifier(TexturingRegs::TevStageConfig::ColorModifier factor, | ||||
| Common::Vec3<u8> GetColorModifier(Pica::TexturingRegs::TevStageConfig::ColorModifier factor, | ||||
|                                   const Common::Vec4<u8>& values); | ||||
| 
 | ||||
| u8 GetAlphaModifier(TexturingRegs::TevStageConfig::AlphaModifier factor, | ||||
| u8 GetAlphaModifier(Pica::TexturingRegs::TevStageConfig::AlphaModifier factor, | ||||
|                     const Common::Vec4<u8>& values); | ||||
| 
 | ||||
| Common::Vec3<u8> ColorCombine(TexturingRegs::TevStageConfig::Operation op, | ||||
|                               const Common::Vec3<u8> input[3]); | ||||
| Common::Vec3<u8> ColorCombine(Pica::TexturingRegs::TevStageConfig::Operation op, | ||||
|                               std::span<const Common::Vec3<u8>, 3> input); | ||||
| 
 | ||||
| u8 AlphaCombine(TexturingRegs::TevStageConfig::Operation op, const std::array<u8, 3>& input); | ||||
| u8 AlphaCombine(Pica::TexturingRegs::TevStageConfig::Operation op, const std::array<u8, 3>& input); | ||||
| 
 | ||||
| } // namespace Pica::Rasterizer
 | ||||
| } // namespace SwRenderer
 | ||||
|  |  | |||
|  | @ -54,12 +54,12 @@ struct DebugData<true> { | |||
|             LOOP_INT_IN = 0x800, | ||||
|         }; | ||||
| 
 | ||||
|         Common::Vec4<float24> src1; | ||||
|         Common::Vec4<float24> src2; | ||||
|         Common::Vec4<float24> src3; | ||||
|         Common::Vec4<f24> src1; | ||||
|         Common::Vec4<f24> src2; | ||||
|         Common::Vec4<f24> src3; | ||||
| 
 | ||||
|         Common::Vec4<float24> dest_in; | ||||
|         Common::Vec4<float24> dest_out; | ||||
|         Common::Vec4<f24> dest_in; | ||||
|         Common::Vec4<f24> dest_out; | ||||
| 
 | ||||
|         s32 address_registers[2]; | ||||
|         bool conditional_code[2]; | ||||
|  | @ -89,7 +89,7 @@ template <DebugDataRecord::Type type, typename ValueType> | |||
| inline void SetField(DebugDataRecord& record, ValueType value); | ||||
| 
 | ||||
| template <> | ||||
| inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) { | ||||
| inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, f24* value) { | ||||
|     record.src1.x = value[0]; | ||||
|     record.src1.y = value[1]; | ||||
|     record.src1.z = value[2]; | ||||
|  | @ -97,7 +97,7 @@ inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* va | |||
| } | ||||
| 
 | ||||
| template <> | ||||
| inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) { | ||||
| inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, f24* value) { | ||||
|     record.src2.x = value[0]; | ||||
|     record.src2.y = value[1]; | ||||
|     record.src2.z = value[2]; | ||||
|  | @ -105,7 +105,7 @@ inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* va | |||
| } | ||||
| 
 | ||||
| template <> | ||||
| inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) { | ||||
| inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, f24* value) { | ||||
|     record.src3.x = value[0]; | ||||
|     record.src3.y = value[1]; | ||||
|     record.src3.z = value[2]; | ||||
|  | @ -113,7 +113,7 @@ inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* va | |||
| } | ||||
| 
 | ||||
| template <> | ||||
| inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) { | ||||
| inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, f24* value) { | ||||
|     record.dest_in.x = value[0]; | ||||
|     record.dest_in.y = value[1]; | ||||
|     record.dest_in.z = value[2]; | ||||
|  | @ -121,7 +121,7 @@ inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* | |||
| } | ||||
| 
 | ||||
| template <> | ||||
| inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) { | ||||
| inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, f24* value) { | ||||
|     record.dest_out.x = value[0]; | ||||
|     record.dest_out.y = value[1]; | ||||
|     record.dest_out.z = value[2]; | ||||
|  |  | |||
|  | @ -5,10 +5,10 @@ | |||
| #include <cmath> | ||||
| #include <cstring> | ||||
| #include "common/arch.h" | ||||
| #include "common/assert.h" | ||||
| #include "common/bit_set.h" | ||||
| #include "common/logging/log.h" | ||||
| #include "common/microprofile.h" | ||||
| #include "video_core/pica_state.h" | ||||
| #include "video_core/regs_rasterizer.h" | ||||
| #include "video_core/regs_shader.h" | ||||
| #include "video_core/shader/shader.h" | ||||
|  | @ -41,11 +41,11 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, | |||
|         // Allow us to overflow OutputVertex to avoid branches, since
 | ||||
|         // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
 | ||||
|         // would be out of bounds otherwise.
 | ||||
|         std::array<float24, 32> vertex_slots_overflow; | ||||
|         std::array<f24, 32> vertex_slots_overflow; | ||||
|     }; | ||||
| 
 | ||||
|     // Assert that OutputVertex has enough space for 24 semantic registers
 | ||||
|     static_assert(sizeof(std::array<float24, 24>) == sizeof(ret), | ||||
|     static_assert(sizeof(std::array<f24, 24>) == sizeof(ret), | ||||
|                   "Struct and array have different sizes."); | ||||
| 
 | ||||
|     unsigned int num_attributes = regs.vs_output_total & 7; | ||||
|  | @ -61,7 +61,7 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, | |||
|     // interpolation
 | ||||
|     for (unsigned i = 0; i < 4; ++i) { | ||||
|         float c = std::fabs(ret.color[i].ToFloat32()); | ||||
|         ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f); | ||||
|         ret.color[i] = f24::FromFloat32(c < 1.0f ? c : 1.0f); | ||||
|     } | ||||
| 
 | ||||
|     LOG_TRACE(HW_GPU, | ||||
|  | @ -86,7 +86,7 @@ void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input | |||
|     } | ||||
| } | ||||
| 
 | ||||
| static void CopyRegistersToOutput(std::span<Common::Vec4<float24>, 16> regs, u32 mask, | ||||
| static void CopyRegistersToOutput(std::span<Common::Vec4<f24>, 16> regs, u32 mask, | ||||
|                                   AttributeBuffer& buffer) { | ||||
|     int output_i = 0; | ||||
|     for (int reg : Common::BitSet<u32>(mask)) { | ||||
|  | @ -108,7 +108,7 @@ GSEmitter::~GSEmitter() { | |||
|     delete handlers; | ||||
| } | ||||
| 
 | ||||
| void GSEmitter::Emit(std::span<Common::Vec4<float24>, 16> output_regs) { | ||||
| void GSEmitter::Emit(std::span<Common::Vec4<f24>, 16> output_regs) { | ||||
|     ASSERT(vertex_id < 3); | ||||
|     // TODO: This should be merged with UnitState::WriteOutput somehow
 | ||||
|     CopyRegistersToOutput(output_regs, output_mask, buffer[vertex_id]); | ||||
|  |  | |||
|  | @ -12,7 +12,6 @@ | |||
| #include <boost/serialization/access.hpp> | ||||
| #include <boost/serialization/array.hpp> | ||||
| #include <boost/serialization/base_object.hpp> | ||||
| #include "common/assert.h" | ||||
| #include "common/common_funcs.h" | ||||
| #include "common/common_types.h" | ||||
| #include "common/hash.h" | ||||
|  | @ -29,7 +28,7 @@ using ProgramCode = std::array<u32, MAX_PROGRAM_CODE_LENGTH>; | |||
| using SwizzleData = std::array<u32, MAX_SWIZZLE_DATA_LENGTH>; | ||||
| 
 | ||||
| struct AttributeBuffer { | ||||
|     alignas(16) Common::Vec4<float24> attr[16]; | ||||
|     alignas(16) Common::Vec4<f24> attr[16]; | ||||
| 
 | ||||
| private: | ||||
|     friend class boost::serialization::access; | ||||
|  | @ -46,16 +45,16 @@ using VertexHandler = std::function<void(const AttributeBuffer&)>; | |||
| using WindingSetter = std::function<void()>; | ||||
| 
 | ||||
| struct OutputVertex { | ||||
|     Common::Vec4<float24> pos; | ||||
|     Common::Vec4<float24> quat; | ||||
|     Common::Vec4<float24> color; | ||||
|     Common::Vec2<float24> tc0; | ||||
|     Common::Vec2<float24> tc1; | ||||
|     float24 tc0_w; | ||||
|     Common::Vec4<f24> pos; | ||||
|     Common::Vec4<f24> quat; | ||||
|     Common::Vec4<f24> color; | ||||
|     Common::Vec2<f24> tc0; | ||||
|     Common::Vec2<f24> tc1; | ||||
|     f24 tc0_w; | ||||
|     INSERT_PADDING_WORDS(1); | ||||
|     Common::Vec3<float24> view; | ||||
|     Common::Vec3<f24> view; | ||||
|     INSERT_PADDING_WORDS(1); | ||||
|     Common::Vec2<float24> tc2; | ||||
|     Common::Vec2<f24> tc2; | ||||
| 
 | ||||
|     static void ValidateSemantics(const RasterizerRegs& regs); | ||||
|     static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, | ||||
|  | @ -76,8 +75,8 @@ private: | |||
|     friend class boost::serialization::access; | ||||
| }; | ||||
| #define ASSERT_POS(var, pos)                                                                       \ | ||||
|     static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \ | ||||
|                                                                         "offset.") | ||||
|     static_assert(offsetof(OutputVertex, var) == pos * sizeof(f24), "Semantic at wrong "           \ | ||||
|                                                                     "offset.") | ||||
| ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X); | ||||
| ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X); | ||||
| ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R); | ||||
|  | @ -109,7 +108,7 @@ struct GSEmitter { | |||
| 
 | ||||
|     GSEmitter(); | ||||
|     ~GSEmitter(); | ||||
|     void Emit(std::span<Common::Vec4<float24>, 16> output_regs); | ||||
|     void Emit(std::span<Common::Vec4<f24>, 16> output_regs); | ||||
| 
 | ||||
| private: | ||||
|     friend class boost::serialization::access; | ||||
|  | @ -136,9 +135,9 @@ struct UnitState { | |||
|     struct Registers { | ||||
|         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
 | ||||
|         // required to be 16-byte aligned.
 | ||||
|         alignas(16) std::array<Common::Vec4<float24>, 16> input; | ||||
|         alignas(16) std::array<Common::Vec4<float24>, 16> temporary; | ||||
|         alignas(16) std::array<Common::Vec4<float24>, 16> output; | ||||
|         alignas(16) std::array<Common::Vec4<f24>, 16> input; | ||||
|         alignas(16) std::array<Common::Vec4<f24>, 16> temporary; | ||||
|         alignas(16) std::array<Common::Vec4<f24>, 16> output; | ||||
| 
 | ||||
|     private: | ||||
|         friend class boost::serialization::access; | ||||
|  | @ -160,18 +159,16 @@ struct UnitState { | |||
|     GSEmitter* emitter_ptr; | ||||
| 
 | ||||
|     static std::size_t InputOffset(int register_index) { | ||||
|         return offsetof(UnitState, registers.input) + | ||||
|                register_index * sizeof(Common::Vec4<float24>); | ||||
|         return offsetof(UnitState, registers.input) + register_index * sizeof(Common::Vec4<f24>); | ||||
|     } | ||||
| 
 | ||||
|     static std::size_t OutputOffset(int register_index) { | ||||
|         return offsetof(UnitState, registers.output) + | ||||
|                register_index * sizeof(Common::Vec4<float24>); | ||||
|         return offsetof(UnitState, registers.output) + register_index * sizeof(Common::Vec4<f24>); | ||||
|     } | ||||
| 
 | ||||
|     static std::size_t TemporaryOffset(int register_index) { | ||||
|         return offsetof(UnitState, registers.temporary) + | ||||
|                register_index * sizeof(Common::Vec4<float24>); | ||||
|                register_index * sizeof(Common::Vec4<f24>); | ||||
|     } | ||||
| 
 | ||||
|     /**
 | ||||
|  | @ -219,13 +216,13 @@ private: | |||
| struct Uniforms { | ||||
|     // The float uniforms are accessed by the shader JIT using SSE instructions, and are
 | ||||
|     // therefore required to be 16-byte aligned.
 | ||||
|     alignas(16) std::array<Common::Vec4<float24>, 96> f; | ||||
|     alignas(16) std::array<Common::Vec4<f24>, 96> f; | ||||
| 
 | ||||
|     std::array<bool, 16> b; | ||||
|     std::array<Common::Vec4<u8>, 4> i; | ||||
| 
 | ||||
|     static std::size_t GetFloatUniformOffset(unsigned index) { | ||||
|         return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<float24>); | ||||
|         return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<f24>); | ||||
|     } | ||||
| 
 | ||||
|     static std::size_t GetBoolUniformOffset(unsigned index) { | ||||
|  |  | |||
|  | @ -80,7 +80,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|     const auto& program_code = setup.program_code; | ||||
| 
 | ||||
|     // Placeholder for invalid inputs
 | ||||
|     static float24 dummy_vec4_float24[4]; | ||||
|     static f24 dummy_vec4_float24[4]; | ||||
| 
 | ||||
|     unsigned iteration = 0; | ||||
|     bool exit_loop = false; | ||||
|  | @ -111,7 +111,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
| 
 | ||||
|         debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter); | ||||
| 
 | ||||
|         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { | ||||
|         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const f24* { | ||||
|             switch (source_reg.GetRegisterType()) { | ||||
|             case RegisterType::Input: | ||||
|                 return &state.registers.input[source_reg.GetIndex()].x; | ||||
|  | @ -137,15 +137,15 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     ? 0 | ||||
|                     : state.address_registers[instr.common.address_register_index - 1]; | ||||
| 
 | ||||
|             const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + | ||||
|                                                         (is_inverted ? 0 : address_offset)); | ||||
|             const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + | ||||
|                                                         (is_inverted ? address_offset : 0)); | ||||
|             const f24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + | ||||
|                                                     (is_inverted ? 0 : address_offset)); | ||||
|             const f24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + | ||||
|                                                     (is_inverted ? address_offset : 0)); | ||||
| 
 | ||||
|             const bool negate_src1 = ((bool)swizzle.negate_src1 != false); | ||||
|             const bool negate_src2 = ((bool)swizzle.negate_src2 != false); | ||||
| 
 | ||||
|             float24 src1[4] = { | ||||
|             f24 src1[4] = { | ||||
|                 src1_[(int)swizzle.src1_selector_0.Value()], | ||||
|                 src1_[(int)swizzle.src1_selector_1.Value()], | ||||
|                 src1_[(int)swizzle.src1_selector_2.Value()], | ||||
|  | @ -157,7 +157,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                 src1[2] = -src1[2]; | ||||
|                 src1[3] = -src1[3]; | ||||
|             } | ||||
|             float24 src2[4] = { | ||||
|             f24 src2[4] = { | ||||
|                 src2_[(int)swizzle.src2_selector_0.Value()], | ||||
|                 src2_[(int)swizzle.src2_selector_1.Value()], | ||||
|                 src2_[(int)swizzle.src2_selector_2.Value()], | ||||
|  | @ -170,12 +170,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                 src2[3] = -src2[3]; | ||||
|             } | ||||
| 
 | ||||
|             float24* dest = | ||||
|                 (instr.common.dest.Value() < 0x10) | ||||
|                     ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] | ||||
|                 : (instr.common.dest.Value() < 0x20) | ||||
|                     ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | ||||
|                     : dummy_vec4_float24; | ||||
|             f24* dest = (instr.common.dest.Value() < 0x10) | ||||
|                             ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] | ||||
|                         : (instr.common.dest.Value() < 0x20) | ||||
|                             ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | ||||
|                             : dummy_vec4_float24; | ||||
| 
 | ||||
|             debug_data.max_opdesc_id = | ||||
|                 std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id); | ||||
|  | @ -216,7 +215,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
| 
 | ||||
|                     dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); | ||||
|                     dest[i] = f24::FromFloat32(std::floor(src1[i].ToFloat32())); | ||||
|                 } | ||||
|                 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||||
|                 break; | ||||
|  | @ -263,11 +262,10 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
| 
 | ||||
|                 OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); | ||||
|                 if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) | ||||
|                     src1[3] = float24::FromFloat32(1.0f); | ||||
|                     src1[3] = f24::One(); | ||||
| 
 | ||||
|                 int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; | ||||
|                 float24 dot = std::inner_product(src1, src1 + num_components, src2, | ||||
|                                                  float24::FromFloat32(0.f)); | ||||
|                 f24 dot = std::inner_product(src1, src1 + num_components, src2, f24::Zero()); | ||||
| 
 | ||||
|                 for (int i = 0; i < 4; ++i) { | ||||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|  | @ -283,7 +281,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|             case OpCode::Id::RCP: { | ||||
|                 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||||
|                 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||||
|                 float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); | ||||
|                 f24 rcp_res = f24::FromFloat32(1.0f / src1[0].ToFloat32()); | ||||
|                 for (int i = 0; i < 4; ++i) { | ||||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
|  | @ -298,7 +296,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|             case OpCode::Id::RSQ: { | ||||
|                 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||||
|                 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||||
|                 float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); | ||||
|                 f24 rsq_res = f24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); | ||||
|                 for (int i = 0; i < 4; ++i) { | ||||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
|  | @ -345,8 +343,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
| 
 | ||||
|                     dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) | ||||
|                                                    : float24::FromFloat32(0.0f); | ||||
|                     dest[i] = (src1[i] >= src2[i]) ? f24::One() : f24::Zero(); | ||||
|                 } | ||||
|                 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||||
|                 break; | ||||
|  | @ -360,8 +357,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
| 
 | ||||
|                     dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) | ||||
|                                                   : float24::FromFloat32(0.0f); | ||||
|                     dest[i] = (src1[i] < src2[i]) ? f24::One() : f24::Zero(); | ||||
|                 } | ||||
|                 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||||
|                 break; | ||||
|  | @ -413,7 +409,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||||
| 
 | ||||
|                 // EX2 only takes first component exp2 and writes it to all dest components
 | ||||
|                 float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); | ||||
|                 f24 ex2_res = f24::FromFloat32(std::exp2(src1[0].ToFloat32())); | ||||
|                 for (int i = 0; i < 4; ++i) { | ||||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
|  | @ -430,7 +426,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||||
| 
 | ||||
|                 // LG2 only takes the first component log2 and writes it to all dest components
 | ||||
|                 float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); | ||||
|                 f24 lg2_res = f24::FromFloat32(std::log2(src1[0].ToFloat32())); | ||||
|                 for (int i = 0; i < 4; ++i) { | ||||
|                     if (!swizzle.DestComponentEnabled(i)) | ||||
|                         continue; | ||||
|  | @ -466,17 +462,17 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                         ? 0 | ||||
|                         : state.address_registers[instr.mad.address_register_index - 1]; | ||||
| 
 | ||||
|                 const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); | ||||
|                 const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + | ||||
|                                                             (!is_inverted * address_offset)); | ||||
|                 const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + | ||||
|                                                             (is_inverted * address_offset)); | ||||
|                 const f24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); | ||||
|                 const f24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + | ||||
|                                                         (!is_inverted * address_offset)); | ||||
|                 const f24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + | ||||
|                                                         (is_inverted * address_offset)); | ||||
| 
 | ||||
|                 const bool negate_src1 = ((bool)mad_swizzle.negate_src1 != false); | ||||
|                 const bool negate_src2 = ((bool)mad_swizzle.negate_src2 != false); | ||||
|                 const bool negate_src3 = ((bool)mad_swizzle.negate_src3 != false); | ||||
| 
 | ||||
|                 float24 src1[4] = { | ||||
|                 f24 src1[4] = { | ||||
|                     src1_[(int)mad_swizzle.src1_selector_0.Value()], | ||||
|                     src1_[(int)mad_swizzle.src1_selector_1.Value()], | ||||
|                     src1_[(int)mad_swizzle.src1_selector_2.Value()], | ||||
|  | @ -488,7 +484,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     src1[2] = -src1[2]; | ||||
|                     src1[3] = -src1[3]; | ||||
|                 } | ||||
|                 float24 src2[4] = { | ||||
|                 f24 src2[4] = { | ||||
|                     src2_[(int)mad_swizzle.src2_selector_0.Value()], | ||||
|                     src2_[(int)mad_swizzle.src2_selector_1.Value()], | ||||
|                     src2_[(int)mad_swizzle.src2_selector_2.Value()], | ||||
|  | @ -500,7 +496,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     src2[2] = -src2[2]; | ||||
|                     src2[3] = -src2[3]; | ||||
|                 } | ||||
|                 float24 src3[4] = { | ||||
|                 f24 src3[4] = { | ||||
|                     src3_[(int)mad_swizzle.src3_selector_0.Value()], | ||||
|                     src3_[(int)mad_swizzle.src3_selector_1.Value()], | ||||
|                     src3_[(int)mad_swizzle.src3_selector_2.Value()], | ||||
|  | @ -513,12 +509,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData | |||
|                     src3[3] = -src3[3]; | ||||
|                 } | ||||
| 
 | ||||
|                 float24* dest = | ||||
|                     (instr.mad.dest.Value() < 0x10) | ||||
|                         ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] | ||||
|                     : (instr.mad.dest.Value() < 0x20) | ||||
|                         ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | ||||
|                         : dummy_vec4_float24; | ||||
|                 f24* dest = (instr.mad.dest.Value() < 0x10) | ||||
|                                 ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] | ||||
|                             : (instr.mad.dest.Value() < 0x20) | ||||
|                                 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | ||||
|                                 : dummy_vec4_float24; | ||||
| 
 | ||||
|                 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||||
|                 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||||
|  | @ -687,7 +682,7 @@ DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, | |||
|     DebugData<true> debug_data; | ||||
| 
 | ||||
|     // Setup input register table
 | ||||
|     state.registers.input.fill(Common::Vec4<float24>::AssignToAll(float24::Zero())); | ||||
|     state.registers.input.fill(Common::Vec4<f24>::AssignToAll(f24::Zero())); | ||||
|     state.LoadInput(config, input); | ||||
|     RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); | ||||
|     return debug_data; | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ | |||
| #include "common/arch.h" | ||||
| #if CITRA_ARCH(x86_64) | ||||
| 
 | ||||
| #include "common/assert.h" | ||||
| #include "common/microprofile.h" | ||||
| #include "video_core/shader/shader.h" | ||||
| #include "video_core/shader/shader_jit_x64.h" | ||||
|  |  | |||
|  | @ -813,7 +813,7 @@ void JitShader::Compile_JMP(Instruction instr) { | |||
|     } | ||||
| } | ||||
| 
 | ||||
| static void Emit(GSEmitter* emitter, Common::Vec4<float24> (*output)[16]) { | ||||
| static void Emit(GSEmitter* emitter, Common::Vec4<f24> (*output)[16]) { | ||||
|     emitter->Emit(*output); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -98,7 +98,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, | |||
|                 const s8* srcdata = reinterpret_cast<const s8*>( | ||||
|                     VideoCore::g_memory->GetPhysicalPointer(source_addr)); | ||||
|                 for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { | ||||
|                     input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); | ||||
|                     input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|  | @ -106,7 +106,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, | |||
|                 const u8* srcdata = reinterpret_cast<const u8*>( | ||||
|                     VideoCore::g_memory->GetPhysicalPointer(source_addr)); | ||||
|                 for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { | ||||
|                     input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); | ||||
|                     input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|  | @ -114,7 +114,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, | |||
|                 const s16* srcdata = reinterpret_cast<const s16*>( | ||||
|                     VideoCore::g_memory->GetPhysicalPointer(source_addr)); | ||||
|                 for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { | ||||
|                     input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); | ||||
|                     input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|  | @ -122,7 +122,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, | |||
|                 const float* srcdata = reinterpret_cast<const float*>( | ||||
|                     VideoCore::g_memory->GetPhysicalPointer(source_addr)); | ||||
|                 for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { | ||||
|                     input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); | ||||
|                     input.attr[i][comp] = f24::FromFloat32(srcdata[comp]); | ||||
|                 } | ||||
|                 break; | ||||
|             } | ||||
|  | @ -132,8 +132,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, | |||
|             // is *not* carried over from the default attribute settings even if they're
 | ||||
|             // enabled for this attribute.
 | ||||
|             for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) { | ||||
|                 input.attr[i][comp] = | ||||
|                     comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); | ||||
|                 input.attr[i][comp] = comp == 3 ? f24::One() : f24::Zero(); | ||||
|             } | ||||
| 
 | ||||
|             LOG_TRACE(HW_GPU, | ||||
|  |  | |||
|  | @ -40,7 +40,7 @@ void Init(Frontend::EmuWindow& emu_window, Frontend::EmuWindow* secondary_window | |||
| 
 | ||||
|     switch (graphics_api) { | ||||
|     case Settings::GraphicsAPI::Software: | ||||
|         g_renderer = std::make_unique<VideoCore::RendererSoftware>(system, emu_window); | ||||
|         g_renderer = std::make_unique<SwRenderer::RendererSoftware>(system, emu_window); | ||||
|         break; | ||||
|     case Settings::GraphicsAPI::OpenGL: | ||||
|         g_renderer = std::make_unique<OpenGL::RendererOpenGL>(system, emu_window, secondary_window); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue