From a19166ddec72cd58e4cbaaf6bfbd10c01cfcc171 Mon Sep 17 00:00:00 2001
From: Vitor Kiguchi <vitor-kiguchi@hotmail.com>
Date: Sat, 20 Jan 2024 19:51:18 -0300
Subject: [PATCH 1/4] pica_types: minor cleanup

---
 src/video_core/pica_types.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 4fd27f9fe..ad454555e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <bit>
 #include <cmath>
 #include <cstring>
 #include <boost/serialization/access.hpp>
@@ -50,7 +51,7 @@ public:
             hex = sign;
         }
 
-        std::memcpy(&res.value, &hex, sizeof(float));
+        res.value = std::bit_cast<float>(hex);
 
         return res;
     }
@@ -95,17 +96,17 @@ public:
     }
 
     constexpr Float<M, E>& operator/=(const Float<M, E>& flt) {
-        value /= flt.ToFloat32();
+        value = operator/(flt).value;
         return *this;
     }
 
     constexpr Float<M, E>& operator+=(const Float<M, E>& flt) {
-        value += flt.ToFloat32();
+        value = operator+(flt).value;
         return *this;
     }
 
     constexpr Float<M, E>& operator-=(const Float<M, E>& flt) {
-        value -= flt.ToFloat32();
+        value = operator-(flt).value;
         return *this;
     }
 

From d2af98673a636d9b86ea288e676a3e9b72c8eee5 Mon Sep 17 00:00:00 2001
From: Vitor Kiguchi <vitor-kiguchi@hotmail.com>
Date: Sat, 20 Jan 2024 21:20:28 -0300
Subject: [PATCH 2/4] pica_types: float: truncate, flush to 0, and treat
 infinities for all values

doing it at FromFloat32 results in this being applied for all
float<M, E> that aren't constructed from raw.

Note: due to lack of compiler support for C++23 at the moment,
the use of std::isnormal and std::abs results in Trunc not being
constexpr, which required the changes to Zero, One, and operator-,
to prevent FromFloat32 being used in constexpr contexts, and those
specific changes may be reverted in the future.
---
 src/video_core/pica_types.h                   | 66 ++++++++++++++++++-
 .../renderer_software/sw_rasterizer.cpp       |  5 +-
 2 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index ad454555e..043691e87 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -7,6 +7,7 @@
 #include <bit>
 #include <cmath>
 #include <cstring>
+#include <limits>
 #include <boost/serialization/access.hpp>
 #include "common/common_types.h"
 
@@ -29,6 +30,41 @@ public:
     static constexpr Float<M, E> FromFloat32(float val) {
         Float<M, E> ret;
         ret.value = val;
+        return Trunc(ret);
+    }
+
+    static constexpr Float<M, E> MinNormal() {
+        Float<M, E> ret;
+        // Mininum normal value = 1.0 / (1 << ((1 << (E - 1)) - 2));
+        if constexpr (E == 5) {
+            ret.value = 0x1.p-14;
+        } else {
+            // E == 7
+            ret.value = (0x1.p-62);
+        }
+        return ret;
+    }
+
+    // these values are approximate, rounded up
+    static constexpr Float<M, E> Max() {
+        Float<M, E> ret;
+        if constexpr (E == 5) {
+            ret.value = 0x1.p16;
+        } else {
+            // E == 7
+            ret.value = 0x1.p64;
+        }
+        return ret;
+    }
+
+    // before C++23 std::isnormal and std::abs aren't considered constexpr so this function can't be
+    // used as constexpr until the compilers support that.
+    static constexpr Float<M, E> Trunc(const Float<M, E>& val) {
+        Float<M, E> ret = val.Flushed().InfChecked();
+        if (std::isnormal(val.ToFloat32())) {
+            u32 hex = std::bit_cast<u32>(ret.ToFloat32()) & (0xffffffff ^ ((1 << (23 - M)) - 1));
+            ret.value = std::bit_cast<float>(hex);
+        }
         return ret;
     }
 
@@ -57,11 +93,15 @@ public:
     }
 
     static constexpr Float<M, E> Zero() {
-        return FromFloat32(0.f);
+        Float<M, E> ret;
+        ret.value = 0.f;
+        return ret;
     }
 
     static constexpr Float<M, E> One() {
-        return FromFloat32(1.f);
+        Float<M, E> ret;
+        ret.value = 1.f;
+        return ret;
     }
 
     // Not recommended for anything but logging
@@ -69,6 +109,24 @@ public:
         return value;
     }
 
+    constexpr Float<M, E> Flushed() const {
+        Float<M, E> ret;
+        ret.value = value;
+        if (std::abs(value) < MinNormal().ToFloat32()) {
+            ret.value = 0;
+        }
+        return ret;
+    }
+
+    constexpr Float<M, E> InfChecked() const {
+        Float<M, E> ret;
+        ret.value = value;
+        if (std::abs(value) > Max().ToFloat32()) {
+            ret.value = value * std::numeric_limits<float>::infinity();
+        }
+        return ret;
+    }
+
     constexpr Float<M, E> operator*(const Float<M, E>& flt) const {
         float result = value * flt.ToFloat32();
         // PICA gives 0 instead of NaN when multiplying by inf
@@ -111,7 +169,9 @@ public:
     }
 
     constexpr Float<M, E> operator-() const {
-        return Float<M, E>::FromFloat32(-ToFloat32());
+        Float<M, E> ret;
+        ret.value = -value;
+        return ret;
     }
 
     constexpr bool operator<(const Float<M, E>& flt) const {
diff --git a/src/video_core/renderer_software/sw_rasterizer.cpp b/src/video_core/renderer_software/sw_rasterizer.cpp
index f0f290ebe..6011400e7 100644
--- a/src/video_core/renderer_software/sw_rasterizer.cpp
+++ b/src/video_core/renderer_software/sw_rasterizer.cpp
@@ -125,9 +125,8 @@ void RasterizerSoftware::AddTriangle(const Pica::OutputVertex& v0, const Pica::O
     auto* input_list = &buffer_b;
 
     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
-    // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
-    //       epsilon possible within f24 accuracy.
-    static constexpr f24 EPSILON = f24::FromFloat32(0.00001f);
+    // TODO: Not sure if this is a valid approach.
+    static constexpr f24 EPSILON = f24::MinNormal();
     static constexpr f24 f0 = f24::Zero();
     static constexpr f24 f1 = f24::One();
     static constexpr std::array<ClippingEdge, 7> clipping_edges = {{

From 5a9bb045d72998c3d1a76e2afbae60119fb033da Mon Sep 17 00:00:00 2001
From: Vitor Kiguchi <vitor-kiguchi@hotmail.com>
Date: Tue, 23 Jan 2024 00:56:57 -0300
Subject: [PATCH 3/4] tests: fix shader_jit_compiler and add pica_float

The Uniform Read and Address Register Offset ran into precision
differences between f32 and f24, which can be easily fixed and
are not the point of the tests.
As for the LG2 and EX2 tests that were failing, they were wrong.
While it is true that 2^79.7 ~= 1e24, the value is bigger than the
biggest representable value in f24, therefore both EX2 and LG2(?)
should result in Inf.
---
 src/tests/CMakeLists.txt                      |  1 +
 src/tests/video_core/pica_float.cpp           | 30 +++++++++++++++++++
 .../video_core/shader/shader_jit_compiler.cpp | 10 ++++---
 3 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 src/tests/video_core/pica_float.cpp

diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 590f07e78..741ba7ec0 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -13,6 +13,7 @@ add_executable(tests
     audio_core/audio_fixures.h
     audio_core/decoder_tests.cpp
     video_core/shader/shader_jit_compiler.cpp
+    video_core/pica_float.cpp
     audio_core/merryhime_3ds_audio/merry_audio/merry_audio.cpp
     audio_core/merryhime_3ds_audio/merry_audio/merry_audio.h
     audio_core/merryhime_3ds_audio/merry_audio/service_fixture.cpp
diff --git a/src/tests/video_core/pica_float.cpp b/src/tests/video_core/pica_float.cpp
new file mode 100644
index 000000000..394420c99
--- /dev/null
+++ b/src/tests/video_core/pica_float.cpp
@@ -0,0 +1,30 @@
+// Copyright 2024 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cmath>
+#include <catch2/catch_approx.hpp>
+#include <catch2/catch_test_macros.hpp>
+#include "video_core/pica_types.h"
+
+using Pica::f24;
+
+TEST_CASE("Infinities", "[video_core][pica_float]") {
+    REQUIRE(std::isinf(f24::FromFloat32(INFINITY).ToFloat32()));
+    REQUIRE(std::isinf(f24::FromFloat32(1.e20f).ToFloat32()));
+    REQUIRE(std::isinf(f24::FromFloat32(-1.e20f).ToFloat32()));
+}
+
+TEST_CASE("Subnormals", "[video_core][pica_float]") {
+    REQUIRE(f24::FromFloat32(1e-20f).ToFloat32() == 0.f);
+}
+
+TEST_CASE("NaN", "[video_core][pica_float]") {
+    const auto inf = f24::FromFloat32(INFINITY);
+    const auto nan = f24::FromFloat32(NAN);
+
+    REQUIRE(std::isnan(nan.ToFloat32()));
+    REQUIRE(std::isnan((nan * f24::Zero()).ToFloat32()));
+    REQUIRE(std::isnan((inf - inf).ToFloat32()));
+    REQUIRE((inf * f24::Zero()).ToFloat32() == 0.f);
+}
diff --git a/src/tests/video_core/shader/shader_jit_compiler.cpp b/src/tests/video_core/shader/shader_jit_compiler.cpp
index 01698a2e1..579bf7e99 100644
--- a/src/tests/video_core/shader/shader_jit_compiler.cpp
+++ b/src/tests/video_core/shader/shader_jit_compiler.cpp
@@ -260,7 +260,7 @@ TEST_CASE("LG2", "[video_core][shader][shader_jit]") {
     REQUIRE(std::isinf(shader.Run(0.f).x));
     REQUIRE(shader.Run(4.f).x == Catch::Approx(2.f));
     REQUIRE(shader.Run(64.f).x == Catch::Approx(6.f));
-    REQUIRE(shader.Run(1.e24f).x == Catch::Approx(79.7262742773f));
+    // REQUIRE(std::isinf(shader.Run(INFINITY).x));
 }
 
 TEST_CASE("EX2", "[video_core][shader][shader_jit]") {
@@ -277,8 +277,9 @@ TEST_CASE("EX2", "[video_core][shader][shader_jit]") {
     REQUIRE(shader.Run(0.f).x == Catch::Approx(1.f));
     REQUIRE(shader.Run(2.f).x == Catch::Approx(4.f));
     REQUIRE(shader.Run(6.f).x == Catch::Approx(64.f));
-    REQUIRE(shader.Run(79.7262742773f).x == Catch::Approx(1.e24f));
     REQUIRE(std::isinf(shader.Run(800.f).x));
+    // If we respect f24 precision, 2^79 = inf, as 79 > 63
+    // REQUIRE(std::isinf(shader.Run(79.7262742773f).x));
 }
 
 TEST_CASE("MUL", "[video_core][shader][shader_jit]") {
@@ -469,7 +470,7 @@ TEST_CASE("Uniform Read", "[video_core][shader][shader_jit]") {
         const float color = (i * 2.0f) / 255.0f;
         const auto color_f24 = Pica::f24::FromFloat32(color);
         shader.shader_setup->uniforms.f[i] = {color_f24, color_f24, color_f24, Pica::f24::One()};
-        f_uniforms[i] = {color, color, color, 1.0f};
+        f_uniforms[i] = {color_f24.ToFloat32(), color_f24.ToFloat32(), color_f24.ToFloat32(), 1.0f};
     }
 
     for (u32 i = 0; i < 96; ++i) {
@@ -506,7 +507,8 @@ TEST_CASE("Address Register Offset", "[video_core][shader][shader_jit]") {
             const auto color_f24 = Pica::f24::FromFloat32(color);
             shader.shader_setup->uniforms.f[i] = {color_f24, color_f24, color_f24,
                                                   Pica::f24::One()};
-            f_uniforms[i] = {color, color, color, 1.f};
+            f_uniforms[i] = {color_f24.ToFloat32(), color_f24.ToFloat32(), color_f24.ToFloat32(),
+                             1.f};
         } else if (i >= 0x60 && i < 0x64) {
             const u8 color = static_cast<u8>((i - 0x60) * 0x10);
             shader.shader_setup->uniforms.i[i - 0x60] = {color, color, color, 255};

From 265ed7ec14eff0b08696db2a9d259f27712a29da Mon Sep 17 00:00:00 2001
From: Vitor Kiguchi <vitor-kiguchi@hotmail.com>
Date: Mon, 5 Feb 2024 17:13:56 -0300
Subject: [PATCH 4/4] sw_rasterizer: don't include the bias in the barycentric
 coords

---
 .../renderer_software/sw_rasterizer.cpp          | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/video_core/renderer_software/sw_rasterizer.cpp b/src/video_core/renderer_software/sw_rasterizer.cpp
index 6011400e7..134acf5b5 100644
--- a/src/video_core/renderer_software/sw_rasterizer.cpp
+++ b/src/video_core/renderer_software/sw_rasterizer.cpp
@@ -30,7 +30,7 @@ using Pica::Texture::TextureInfo;
 // negative/positive z values when computing with f32 precision,
 // causing some vertices to get erroneously clipped. To workaround this problem,
 // we can use a very small epsilon value for clip plane comparison.
-constexpr f32 EPSILON_Z = 0.00000001f;
+constexpr f32 EPSILON_Z = 0.f;
 
 struct Vertex : Pica::OutputVertex {
     Vertex(const OutputVertex& v) : OutputVertex(v) {}
@@ -286,11 +286,11 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
     max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask());
 
     const int bias0 =
-        IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
+        IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? 1 : 0;
     const int bias1 =
-        IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
+        IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? 1 : 0;
     const int bias2 =
-        IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
+        IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? 1 : 0;
 
     const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
 
@@ -313,13 +313,13 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
                 }
 
                 // Calculate the barycentric coordinates w0, w1 and w2
-                const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
-                const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
-                const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+                const s32 w0 = SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+                const s32 w1 = SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+                const s32 w2 = SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
                 const s32 wsum = w0 + w1 + w2;
 
                 // If current pixel is not covered by the current primitive
-                if (w0 < 0 || w1 < 0 || w2 < 0) {
+                if (w0 < bias0 || w1 < bias1 || w2 < bias2) {
                     continue;
                 }