Refactor software renderer (#6621)

2025-12-17 20:58:47 +00:00 · 2023-06-24 01:59:18 +03:00 · 2023-06-24 01:59:18 +03:00 · 9b82de6b24
commit 9b82de6b24
parent 7198243319
39 changed files with 1815 additions and 1796 deletions
--- a/src/video_core/shader/debug_data.h
+++ b/src/video_core/shader/debug_data.h
@ -54,12 +54,12 @@ struct DebugData<true> {
            LOOP_INT_IN = 0x800,
        };

-        Common::Vec4<float24> src1;
-        Common::Vec4<float24> src2;
-        Common::Vec4<float24> src3;
+        Common::Vec4<f24> src1;
+        Common::Vec4<f24> src2;
+        Common::Vec4<f24> src3;

-        Common::Vec4<float24> dest_in;
-        Common::Vec4<float24> dest_out;
+        Common::Vec4<f24> dest_in;
+        Common::Vec4<f24> dest_out;

        s32 address_registers[2];
        bool conditional_code[2];
@ -89,7 +89,7 @@ template <DebugDataRecord::Type type, typename ValueType>
 inline void SetField(DebugDataRecord& record, ValueType value);

 template <>
-inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
+inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, f24* value) {
    record.src1.x = value[0];
    record.src1.y = value[1];
    record.src1.z = value[2];
@ -97,7 +97,7 @@ inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* va
 }

 template <>
-inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
+inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, f24* value) {
    record.src2.x = value[0];
    record.src2.y = value[1];
    record.src2.z = value[2];
@ -105,7 +105,7 @@ inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* va
 }

 template <>
-inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
+inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, f24* value) {
    record.src3.x = value[0];
    record.src3.y = value[1];
    record.src3.z = value[2];
@ -113,7 +113,7 @@ inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* va
 }

 template <>
-inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
+inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, f24* value) {
    record.dest_in.x = value[0];
    record.dest_in.y = value[1];
    record.dest_in.z = value[2];
@ -121,7 +121,7 @@ inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24*
 }

 template <>
-inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
+inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, f24* value) {
    record.dest_out.x = value[0];
    record.dest_out.y = value[1];
    record.dest_out.z = value[2];
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@ -5,10 +5,10 @@
 #include <cmath>
 #include <cstring>
 #include "common/arch.h"
+#include "common/assert.h"
 #include "common/bit_set.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
-#include "video_core/pica_state.h"
 #include "video_core/regs_rasterizer.h"
 #include "video_core/regs_shader.h"
 #include "video_core/shader/shader.h"
@ -41,11 +41,11 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
        // Allow us to overflow OutputVertex to avoid branches, since
        // RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
        // would be out of bounds otherwise.
-        std::array<float24, 32> vertex_slots_overflow;
+        std::array<f24, 32> vertex_slots_overflow;
    };

    // Assert that OutputVertex has enough space for 24 semantic registers
-    static_assert(sizeof(std::array<float24, 24>) == sizeof(ret),
+    static_assert(sizeof(std::array<f24, 24>) == sizeof(ret),
                  "Struct and array have different sizes.");

    unsigned int num_attributes = regs.vs_output_total & 7;
@ -61,7 +61,7 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
    // interpolation
    for (unsigned i = 0; i < 4; ++i) {
        float c = std::fabs(ret.color[i].ToFloat32());
-        ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
+        ret.color[i] = f24::FromFloat32(c < 1.0f ? c : 1.0f);
    }

    LOG_TRACE(HW_GPU,
@ -86,7 +86,7 @@ void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input
    }
 }

-static void CopyRegistersToOutput(std::span<Common::Vec4<float24>, 16> regs, u32 mask,
+static void CopyRegistersToOutput(std::span<Common::Vec4<f24>, 16> regs, u32 mask,
                                  AttributeBuffer& buffer) {
    int output_i = 0;
    for (int reg : Common::BitSet<u32>(mask)) {
@ -108,7 +108,7 @@ GSEmitter::~GSEmitter() {
    delete handlers;
 }

-void GSEmitter::Emit(std::span<Common::Vec4<float24>, 16> output_regs) {
+void GSEmitter::Emit(std::span<Common::Vec4<f24>, 16> output_regs) {
    ASSERT(vertex_id < 3);
    // TODO: This should be merged with UnitState::WriteOutput somehow
    CopyRegistersToOutput(output_regs, output_mask, buffer[vertex_id]);
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@ -12,7 +12,6 @@
 #include <boost/serialization/access.hpp>
 #include <boost/serialization/array.hpp>
 #include <boost/serialization/base_object.hpp>
-#include "common/assert.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/hash.h"
@ -29,7 +28,7 @@ using ProgramCode = std::array<u32, MAX_PROGRAM_CODE_LENGTH>;
 using SwizzleData = std::array<u32, MAX_SWIZZLE_DATA_LENGTH>;

 struct AttributeBuffer {
-    alignas(16) Common::Vec4<float24> attr[16];
+    alignas(16) Common::Vec4<f24> attr[16];

 private:
    friend class boost::serialization::access;
@ -46,16 +45,16 @@ using VertexHandler = std::function<void(const AttributeBuffer&)>;
 using WindingSetter = std::function<void()>;

 struct OutputVertex {
-    Common::Vec4<float24> pos;
-    Common::Vec4<float24> quat;
-    Common::Vec4<float24> color;
-    Common::Vec2<float24> tc0;
-    Common::Vec2<float24> tc1;
-    float24 tc0_w;
+    Common::Vec4<f24> pos;
+    Common::Vec4<f24> quat;
+    Common::Vec4<f24> color;
+    Common::Vec2<f24> tc0;
+    Common::Vec2<f24> tc1;
+    f24 tc0_w;
    INSERT_PADDING_WORDS(1);
-    Common::Vec3<float24> view;
+    Common::Vec3<f24> view;
    INSERT_PADDING_WORDS(1);
-    Common::Vec2<float24> tc2;
+    Common::Vec2<f24> tc2;

    static void ValidateSemantics(const RasterizerRegs& regs);
    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
@ -76,8 +75,8 @@ private:
    friend class boost::serialization::access;
 };
 #define ASSERT_POS(var, pos)                                                                       \
-    static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \
-                                                                        "offset.")
+    static_assert(offsetof(OutputVertex, var) == pos * sizeof(f24), "Semantic at wrong "           \
+                                                                    "offset.")
 ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X);
 ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X);
 ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R);
@ -109,7 +108,7 @@ struct GSEmitter {

    GSEmitter();
    ~GSEmitter();
-    void Emit(std::span<Common::Vec4<float24>, 16> output_regs);
+    void Emit(std::span<Common::Vec4<f24>, 16> output_regs);

 private:
    friend class boost::serialization::access;
@ -136,9 +135,9 @@ struct UnitState {
    struct Registers {
        // The registers are accessed by the shader JIT using SSE instructions, and are therefore
        // required to be 16-byte aligned.
-        alignas(16) std::array<Common::Vec4<float24>, 16> input;
-        alignas(16) std::array<Common::Vec4<float24>, 16> temporary;
-        alignas(16) std::array<Common::Vec4<float24>, 16> output;
+        alignas(16) std::array<Common::Vec4<f24>, 16> input;
+        alignas(16) std::array<Common::Vec4<f24>, 16> temporary;
+        alignas(16) std::array<Common::Vec4<f24>, 16> output;

    private:
        friend class boost::serialization::access;
@ -160,18 +159,16 @@ struct UnitState {
    GSEmitter* emitter_ptr;

    static std::size_t InputOffset(int register_index) {
-        return offsetof(UnitState, registers.input) +
-               register_index * sizeof(Common::Vec4<float24>);
+        return offsetof(UnitState, registers.input) + register_index * sizeof(Common::Vec4<f24>);
    }

    static std::size_t OutputOffset(int register_index) {
-        return offsetof(UnitState, registers.output) +
-               register_index * sizeof(Common::Vec4<float24>);
+        return offsetof(UnitState, registers.output) + register_index * sizeof(Common::Vec4<f24>);
    }

    static std::size_t TemporaryOffset(int register_index) {
        return offsetof(UnitState, registers.temporary) +
-               register_index * sizeof(Common::Vec4<float24>);
+               register_index * sizeof(Common::Vec4<f24>);
    }

    /**
@ -219,13 +216,13 @@ private:
 struct Uniforms {
    // The float uniforms are accessed by the shader JIT using SSE instructions, and are
    // therefore required to be 16-byte aligned.
-    alignas(16) std::array<Common::Vec4<float24>, 96> f;
+    alignas(16) std::array<Common::Vec4<f24>, 96> f;

    std::array<bool, 16> b;
    std::array<Common::Vec4<u8>, 4> i;

    static std::size_t GetFloatUniformOffset(unsigned index) {
-        return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<float24>);
+        return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<f24>);
    }

    static std::size_t GetBoolUniformOffset(unsigned index) {
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@ -80,7 +80,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
    const auto& program_code = setup.program_code;

    // Placeholder for invalid inputs
-    static float24 dummy_vec4_float24[4];
+    static f24 dummy_vec4_float24[4];

    unsigned iteration = 0;
    bool exit_loop = false;
@ -111,7 +111,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData

        debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter);

-        auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
+        auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const f24* {
            switch (source_reg.GetRegisterType()) {
            case RegisterType::Input:
                return &state.registers.input[source_reg.GetIndex()].x;
@ -137,15 +137,15 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    ? 0
                    : state.address_registers[instr.common.address_register_index - 1];

-            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) +
-                                                        (is_inverted ? 0 : address_offset));
-            const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) +
-                                                        (is_inverted ? address_offset : 0));
+            const f24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) +
+                                                    (is_inverted ? 0 : address_offset));
+            const f24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) +
+                                                    (is_inverted ? address_offset : 0));

            const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
            const bool negate_src2 = ((bool)swizzle.negate_src2 != false);

-            float24 src1[4] = {
+            f24 src1[4] = {
                src1_[(int)swizzle.src1_selector_0.Value()],
                src1_[(int)swizzle.src1_selector_1.Value()],
                src1_[(int)swizzle.src1_selector_2.Value()],
@ -157,7 +157,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                src1[2] = -src1[2];
                src1[3] = -src1[3];
            }
-            float24 src2[4] = {
+            f24 src2[4] = {
                src2_[(int)swizzle.src2_selector_0.Value()],
                src2_[(int)swizzle.src2_selector_1.Value()],
                src2_[(int)swizzle.src2_selector_2.Value()],
@ -170,12 +170,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                src2[3] = -src2[3];
            }

-            float24* dest =
-                (instr.common.dest.Value() < 0x10)
-                    ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
-                : (instr.common.dest.Value() < 0x20)
-                    ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
-                    : dummy_vec4_float24;
+            f24* dest = (instr.common.dest.Value() < 0x10)
+                            ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
+                        : (instr.common.dest.Value() < 0x20)
+                            ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
+                            : dummy_vec4_float24;

            debug_data.max_opdesc_id =
                std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id);
@ -216,7 +215,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

-                    dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
+                    dest[i] = f24::FromFloat32(std::floor(src1[i].ToFloat32()));
                }
                Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
                break;
@ -263,11 +262,10 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData

                OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode();
                if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI)
-                    src1[3] = float24::FromFloat32(1.0f);
+                    src1[3] = f24::One();

                int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4;
-                float24 dot = std::inner_product(src1, src1 + num_components, src2,
-                                                 float24::FromFloat32(0.f));
+                f24 dot = std::inner_product(src1, src1 + num_components, src2, f24::Zero());

                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
@ -283,7 +281,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
            case OpCode::Id::RCP: {
                Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
                Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
-                float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32());
+                f24 rcp_res = f24::FromFloat32(1.0f / src1[0].ToFloat32());
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;
@ -298,7 +296,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
            case OpCode::Id::RSQ: {
                Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
                Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
-                float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
+                f24 rsq_res = f24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;
@ -345,8 +343,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

-                    dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f)
-                                                   : float24::FromFloat32(0.0f);
+                    dest[i] = (src1[i] >= src2[i]) ? f24::One() : f24::Zero();
                }
                Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
                break;
@ -360,8 +357,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    if (!swizzle.DestComponentEnabled(i))
                        continue;

-                    dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f)
-                                                  : float24::FromFloat32(0.0f);
+                    dest[i] = (src1[i] < src2[i]) ? f24::One() : f24::Zero();
                }
                Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
                break;
@ -413,7 +409,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);

                // EX2 only takes first component exp2 and writes it to all dest components
-                float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
+                f24 ex2_res = f24::FromFloat32(std::exp2(src1[0].ToFloat32()));
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;
@ -430,7 +426,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);

                // LG2 only takes the first component log2 and writes it to all dest components
-                float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
+                f24 lg2_res = f24::FromFloat32(std::log2(src1[0].ToFloat32()));
                for (int i = 0; i < 4; ++i) {
                    if (!swizzle.DestComponentEnabled(i))
                        continue;
@ -466,17 +462,17 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                        ? 0
                        : state.address_registers[instr.mad.address_register_index - 1];

-                const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
-                const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) +
-                                                            (!is_inverted * address_offset));
-                const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) +
-                                                            (is_inverted * address_offset));
+                const f24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
+                const f24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) +
+                                                        (!is_inverted * address_offset));
+                const f24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) +
+                                                        (is_inverted * address_offset));

                const bool negate_src1 = ((bool)mad_swizzle.negate_src1 != false);
                const bool negate_src2 = ((bool)mad_swizzle.negate_src2 != false);
                const bool negate_src3 = ((bool)mad_swizzle.negate_src3 != false);

-                float24 src1[4] = {
+                f24 src1[4] = {
                    src1_[(int)mad_swizzle.src1_selector_0.Value()],
                    src1_[(int)mad_swizzle.src1_selector_1.Value()],
                    src1_[(int)mad_swizzle.src1_selector_2.Value()],
@ -488,7 +484,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    src1[2] = -src1[2];
                    src1[3] = -src1[3];
                }
-                float24 src2[4] = {
+                f24 src2[4] = {
                    src2_[(int)mad_swizzle.src2_selector_0.Value()],
                    src2_[(int)mad_swizzle.src2_selector_1.Value()],
                    src2_[(int)mad_swizzle.src2_selector_2.Value()],
@ -500,7 +496,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    src2[2] = -src2[2];
                    src2[3] = -src2[3];
                }
-                float24 src3[4] = {
+                f24 src3[4] = {
                    src3_[(int)mad_swizzle.src3_selector_0.Value()],
                    src3_[(int)mad_swizzle.src3_selector_1.Value()],
                    src3_[(int)mad_swizzle.src3_selector_2.Value()],
@ -513,12 +509,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                    src3[3] = -src3[3];
                }

-                float24* dest =
-                    (instr.mad.dest.Value() < 0x10)
-                        ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
-                    : (instr.mad.dest.Value() < 0x20)
-                        ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
-                        : dummy_vec4_float24;
+                f24* dest = (instr.mad.dest.Value() < 0x10)
+                                ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
+                            : (instr.mad.dest.Value() < 0x20)
+                                ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
+                                : dummy_vec4_float24;

                Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
                Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
@ -687,7 +682,7 @@ DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
    DebugData<true> debug_data;

    // Setup input register table
-    state.registers.input.fill(Common::Vec4<float24>::AssignToAll(float24::Zero()));
+    state.registers.input.fill(Common::Vec4<f24>::AssignToAll(f24::Zero()));
    state.LoadInput(config, input);
    RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
    return debug_data;
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@ -5,6 +5,7 @@
 #include "common/arch.h"
 #if CITRA_ARCH(x86_64)

+#include "common/assert.h"
 #include "common/microprofile.h"
 #include "video_core/shader/shader.h"
 #include "video_core/shader/shader_jit_x64.h"
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@ -813,7 +813,7 @@ void JitShader::Compile_JMP(Instruction instr) {
    }
 }

-static void Emit(GSEmitter* emitter, Common::Vec4<float24> (*output)[16]) {
+static void Emit(GSEmitter* emitter, Common::Vec4<f24> (*output)[16]) {
    emitter->Emit(*output);
 }