Refactor software renderer (#6621)

This commit is contained in:
GPUCode 2023-06-24 01:59:18 +03:00 committed by GitHub
parent 7198243319
commit 9b82de6b24
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
39 changed files with 1815 additions and 1796 deletions

View file

@ -54,12 +54,12 @@ struct DebugData<true> {
LOOP_INT_IN = 0x800,
};
Common::Vec4<float24> src1;
Common::Vec4<float24> src2;
Common::Vec4<float24> src3;
Common::Vec4<f24> src1;
Common::Vec4<f24> src2;
Common::Vec4<f24> src3;
Common::Vec4<float24> dest_in;
Common::Vec4<float24> dest_out;
Common::Vec4<f24> dest_in;
Common::Vec4<f24> dest_out;
s32 address_registers[2];
bool conditional_code[2];
@ -89,7 +89,7 @@ template <DebugDataRecord::Type type, typename ValueType>
inline void SetField(DebugDataRecord& record, ValueType value);
template <>
inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, f24* value) {
record.src1.x = value[0];
record.src1.y = value[1];
record.src1.z = value[2];
@ -97,7 +97,7 @@ inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* va
}
template <>
inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, f24* value) {
record.src2.x = value[0];
record.src2.y = value[1];
record.src2.z = value[2];
@ -105,7 +105,7 @@ inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* va
}
template <>
inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, f24* value) {
record.src3.x = value[0];
record.src3.y = value[1];
record.src3.z = value[2];
@ -113,7 +113,7 @@ inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* va
}
template <>
inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, f24* value) {
record.dest_in.x = value[0];
record.dest_in.y = value[1];
record.dest_in.z = value[2];
@ -121,7 +121,7 @@ inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24*
}
template <>
inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, f24* value) {
record.dest_out.x = value[0];
record.dest_out.y = value[1];
record.dest_out.z = value[2];

View file

@ -5,10 +5,10 @@
#include <cmath>
#include <cstring>
#include "common/arch.h"
#include "common/assert.h"
#include "common/bit_set.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/pica_state.h"
#include "video_core/regs_rasterizer.h"
#include "video_core/regs_shader.h"
#include "video_core/shader/shader.h"
@ -41,11 +41,11 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
// Allow us to overflow OutputVertex to avoid branches, since
// RasterizerRegs::VSOutputAttributes::INVALID would write to slot 31, which
// would be out of bounds otherwise.
std::array<float24, 32> vertex_slots_overflow;
std::array<f24, 32> vertex_slots_overflow;
};
// Assert that OutputVertex has enough space for 24 semantic registers
static_assert(sizeof(std::array<float24, 24>) == sizeof(ret),
static_assert(sizeof(std::array<f24, 24>) == sizeof(ret),
"Struct and array have different sizes.");
unsigned int num_attributes = regs.vs_output_total & 7;
@ -61,7 +61,7 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
// interpolation
for (unsigned i = 0; i < 4; ++i) {
float c = std::fabs(ret.color[i].ToFloat32());
ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
ret.color[i] = f24::FromFloat32(c < 1.0f ? c : 1.0f);
}
LOG_TRACE(HW_GPU,
@ -86,7 +86,7 @@ void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input
}
}
static void CopyRegistersToOutput(std::span<Common::Vec4<float24>, 16> regs, u32 mask,
static void CopyRegistersToOutput(std::span<Common::Vec4<f24>, 16> regs, u32 mask,
AttributeBuffer& buffer) {
int output_i = 0;
for (int reg : Common::BitSet<u32>(mask)) {
@ -108,7 +108,7 @@ GSEmitter::~GSEmitter() {
delete handlers;
}
void GSEmitter::Emit(std::span<Common::Vec4<float24>, 16> output_regs) {
void GSEmitter::Emit(std::span<Common::Vec4<f24>, 16> output_regs) {
ASSERT(vertex_id < 3);
// TODO: This should be merged with UnitState::WriteOutput somehow
CopyRegistersToOutput(output_regs, output_mask, buffer[vertex_id]);

View file

@ -12,7 +12,6 @@
#include <boost/serialization/access.hpp>
#include <boost/serialization/array.hpp>
#include <boost/serialization/base_object.hpp>
#include "common/assert.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/hash.h"
@ -29,7 +28,7 @@ using ProgramCode = std::array<u32, MAX_PROGRAM_CODE_LENGTH>;
using SwizzleData = std::array<u32, MAX_SWIZZLE_DATA_LENGTH>;
struct AttributeBuffer {
alignas(16) Common::Vec4<float24> attr[16];
alignas(16) Common::Vec4<f24> attr[16];
private:
friend class boost::serialization::access;
@ -46,16 +45,16 @@ using VertexHandler = std::function<void(const AttributeBuffer&)>;
using WindingSetter = std::function<void()>;
struct OutputVertex {
Common::Vec4<float24> pos;
Common::Vec4<float24> quat;
Common::Vec4<float24> color;
Common::Vec2<float24> tc0;
Common::Vec2<float24> tc1;
float24 tc0_w;
Common::Vec4<f24> pos;
Common::Vec4<f24> quat;
Common::Vec4<f24> color;
Common::Vec2<f24> tc0;
Common::Vec2<f24> tc1;
f24 tc0_w;
INSERT_PADDING_WORDS(1);
Common::Vec3<float24> view;
Common::Vec3<f24> view;
INSERT_PADDING_WORDS(1);
Common::Vec2<float24> tc2;
Common::Vec2<f24> tc2;
static void ValidateSemantics(const RasterizerRegs& regs);
static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
@ -76,8 +75,8 @@ private:
friend class boost::serialization::access;
};
#define ASSERT_POS(var, pos) \
static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \
"offset.")
static_assert(offsetof(OutputVertex, var) == pos * sizeof(f24), "Semantic at wrong " \
"offset.")
ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X);
ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X);
ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R);
@ -109,7 +108,7 @@ struct GSEmitter {
GSEmitter();
~GSEmitter();
void Emit(std::span<Common::Vec4<float24>, 16> output_regs);
void Emit(std::span<Common::Vec4<f24>, 16> output_regs);
private:
friend class boost::serialization::access;
@ -136,9 +135,9 @@ struct UnitState {
struct Registers {
// The registers are accessed by the shader JIT using SSE instructions, and are therefore
// required to be 16-byte aligned.
alignas(16) std::array<Common::Vec4<float24>, 16> input;
alignas(16) std::array<Common::Vec4<float24>, 16> temporary;
alignas(16) std::array<Common::Vec4<float24>, 16> output;
alignas(16) std::array<Common::Vec4<f24>, 16> input;
alignas(16) std::array<Common::Vec4<f24>, 16> temporary;
alignas(16) std::array<Common::Vec4<f24>, 16> output;
private:
friend class boost::serialization::access;
@ -160,18 +159,16 @@ struct UnitState {
GSEmitter* emitter_ptr;
static std::size_t InputOffset(int register_index) {
return offsetof(UnitState, registers.input) +
register_index * sizeof(Common::Vec4<float24>);
return offsetof(UnitState, registers.input) + register_index * sizeof(Common::Vec4<f24>);
}
static std::size_t OutputOffset(int register_index) {
return offsetof(UnitState, registers.output) +
register_index * sizeof(Common::Vec4<float24>);
return offsetof(UnitState, registers.output) + register_index * sizeof(Common::Vec4<f24>);
}
static std::size_t TemporaryOffset(int register_index) {
return offsetof(UnitState, registers.temporary) +
register_index * sizeof(Common::Vec4<float24>);
register_index * sizeof(Common::Vec4<f24>);
}
/**
@ -219,13 +216,13 @@ private:
struct Uniforms {
// The float uniforms are accessed by the shader JIT using SSE instructions, and are
// therefore required to be 16-byte aligned.
alignas(16) std::array<Common::Vec4<float24>, 96> f;
alignas(16) std::array<Common::Vec4<f24>, 96> f;
std::array<bool, 16> b;
std::array<Common::Vec4<u8>, 4> i;
static std::size_t GetFloatUniformOffset(unsigned index) {
return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<float24>);
return offsetof(Uniforms, f) + index * sizeof(Common::Vec4<f24>);
}
static std::size_t GetBoolUniformOffset(unsigned index) {

View file

@ -80,7 +80,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
const auto& program_code = setup.program_code;
// Placeholder for invalid inputs
static float24 dummy_vec4_float24[4];
static f24 dummy_vec4_float24[4];
unsigned iteration = 0;
bool exit_loop = false;
@ -111,7 +111,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter);
auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const f24* {
switch (source_reg.GetRegisterType()) {
case RegisterType::Input:
return &state.registers.input[source_reg.GetIndex()].x;
@ -137,15 +137,15 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
? 0
: state.address_registers[instr.common.address_register_index - 1];
const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) +
(is_inverted ? 0 : address_offset));
const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) +
(is_inverted ? address_offset : 0));
const f24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) +
(is_inverted ? 0 : address_offset));
const f24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) +
(is_inverted ? address_offset : 0));
const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
float24 src1[4] = {
f24 src1[4] = {
src1_[(int)swizzle.src1_selector_0.Value()],
src1_[(int)swizzle.src1_selector_1.Value()],
src1_[(int)swizzle.src1_selector_2.Value()],
@ -157,7 +157,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
src1[2] = -src1[2];
src1[3] = -src1[3];
}
float24 src2[4] = {
f24 src2[4] = {
src2_[(int)swizzle.src2_selector_0.Value()],
src2_[(int)swizzle.src2_selector_1.Value()],
src2_[(int)swizzle.src2_selector_2.Value()],
@ -170,12 +170,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
src2[3] = -src2[3];
}
float24* dest =
(instr.common.dest.Value() < 0x10)
? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
: (instr.common.dest.Value() < 0x20)
? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
f24* dest = (instr.common.dest.Value() < 0x10)
? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
: (instr.common.dest.Value() < 0x20)
? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
debug_data.max_opdesc_id =
std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id);
@ -216,7 +215,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
dest[i] = f24::FromFloat32(std::floor(src1[i].ToFloat32()));
}
Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
break;
@ -263,11 +262,10 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode();
if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI)
src1[3] = float24::FromFloat32(1.0f);
src1[3] = f24::One();
int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4;
float24 dot = std::inner_product(src1, src1 + num_components, src2,
float24::FromFloat32(0.f));
f24 dot = std::inner_product(src1, src1 + num_components, src2, f24::Zero());
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
@ -283,7 +281,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
case OpCode::Id::RCP: {
Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32());
f24 rcp_res = f24::FromFloat32(1.0f / src1[0].ToFloat32());
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@ -298,7 +296,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
case OpCode::Id::RSQ: {
Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
f24 rsq_res = f24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@ -345,8 +343,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f)
: float24::FromFloat32(0.0f);
dest[i] = (src1[i] >= src2[i]) ? f24::One() : f24::Zero();
}
Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
break;
@ -360,8 +357,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
if (!swizzle.DestComponentEnabled(i))
continue;
dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f)
: float24::FromFloat32(0.0f);
dest[i] = (src1[i] < src2[i]) ? f24::One() : f24::Zero();
}
Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
break;
@ -413,7 +409,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
// EX2 only takes first component exp2 and writes it to all dest components
float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
f24 ex2_res = f24::FromFloat32(std::exp2(src1[0].ToFloat32()));
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@ -430,7 +426,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
// LG2 only takes the first component log2 and writes it to all dest components
float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
f24 lg2_res = f24::FromFloat32(std::log2(src1[0].ToFloat32()));
for (int i = 0; i < 4; ++i) {
if (!swizzle.DestComponentEnabled(i))
continue;
@ -466,17 +462,17 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
? 0
: state.address_registers[instr.mad.address_register_index - 1];
const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) +
(!is_inverted * address_offset));
const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) +
(is_inverted * address_offset));
const f24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
const f24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) +
(!is_inverted * address_offset));
const f24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) +
(is_inverted * address_offset));
const bool negate_src1 = ((bool)mad_swizzle.negate_src1 != false);
const bool negate_src2 = ((bool)mad_swizzle.negate_src2 != false);
const bool negate_src3 = ((bool)mad_swizzle.negate_src3 != false);
float24 src1[4] = {
f24 src1[4] = {
src1_[(int)mad_swizzle.src1_selector_0.Value()],
src1_[(int)mad_swizzle.src1_selector_1.Value()],
src1_[(int)mad_swizzle.src1_selector_2.Value()],
@ -488,7 +484,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
src1[2] = -src1[2];
src1[3] = -src1[3];
}
float24 src2[4] = {
f24 src2[4] = {
src2_[(int)mad_swizzle.src2_selector_0.Value()],
src2_[(int)mad_swizzle.src2_selector_1.Value()],
src2_[(int)mad_swizzle.src2_selector_2.Value()],
@ -500,7 +496,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
src2[2] = -src2[2];
src2[3] = -src2[3];
}
float24 src3[4] = {
f24 src3[4] = {
src3_[(int)mad_swizzle.src3_selector_0.Value()],
src3_[(int)mad_swizzle.src3_selector_1.Value()],
src3_[(int)mad_swizzle.src3_selector_2.Value()],
@ -513,12 +509,11 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
src3[3] = -src3[3];
}
float24* dest =
(instr.mad.dest.Value() < 0x10)
? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
: (instr.mad.dest.Value() < 0x20)
? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
f24* dest = (instr.mad.dest.Value() < 0x10)
? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
: (instr.mad.dest.Value() < 0x20)
? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24;
Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
@ -687,7 +682,7 @@ DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
DebugData<true> debug_data;
// Setup input register table
state.registers.input.fill(Common::Vec4<float24>::AssignToAll(float24::Zero()));
state.registers.input.fill(Common::Vec4<f24>::AssignToAll(f24::Zero()));
state.LoadInput(config, input);
RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
return debug_data;

View file

@ -5,6 +5,7 @@
#include "common/arch.h"
#if CITRA_ARCH(x86_64)
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/shader/shader.h"
#include "video_core/shader/shader_jit_x64.h"

View file

@ -813,7 +813,7 @@ void JitShader::Compile_JMP(Instruction instr) {
}
}
static void Emit(GSEmitter* emitter, Common::Vec4<float24> (*output)[16]) {
static void Emit(GSEmitter* emitter, Common::Vec4<f24> (*output)[16]) {
emitter->Emit(*output);
}