From 0da6a7e2348843027019934ae208753324532fa1 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Thu, 1 Jan 2015 19:58:18 +0100
Subject: [PATCH 01/18] GPU: Properly implement memory fills.

---
 src/core/hle/service/gsp_gpu.cpp | 32 +++++++++++++-------------
 src/core/hle/service/gsp_gpu.h   |  4 ++++
 src/core/hw/gpu.cpp              | 39 ++++++++++++++++++++++----------
 src/core/hw/gpu.h                | 32 +++++++++++++++++++++++---
 4 files changed, 76 insertions(+), 31 deletions(-)

diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index 31e61391f..c23cfa3c8 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -368,28 +368,28 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
     case CommandId::SET_MEMORY_FILL:
     {
         auto& params = command.memory_fill;
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)), 
-                Memory::VirtualToPhysicalAddress(params.start1) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)), 
-                Memory::VirtualToPhysicalAddress(params.end1) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].size)), params.end1 - params.start1);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value)), params.value1);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_start)),
+                         Memory::VirtualToPhysicalAddress(params.start1) >> 3);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].address_end)),
+                         Memory::VirtualToPhysicalAddress(params.end1) >> 3);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].value_32bit)), params.value1);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[0].control)), params.control1);
 
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)), 
-                Memory::VirtualToPhysicalAddress(params.start2) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)), 
-                Memory::VirtualToPhysicalAddress(params.end2) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].size)), params.end2 - params.start2);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value)), params.value2);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_start)),
+                         Memory::VirtualToPhysicalAddress(params.start2) >> 3);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].address_end)),
+                         Memory::VirtualToPhysicalAddress(params.end2) >> 3);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].value_32bit)), params.value2);
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(memory_fill_config[1].control)), params.control2);
         break;
     }
 
     case CommandId::SET_DISPLAY_TRANSFER:
     {
         auto& params = command.image_copy;
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), 
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
                 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), 
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
                 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
@@ -402,9 +402,9 @@ static void ExecuteCommand(const Command& command, u32 thread_id) {
     case CommandId::SET_TEXTURE_COPY:
     {
         auto& params = command.image_copy;
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)), 
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_address)),
                 Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
-        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)), 
+        WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_address)),
                 Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.input_size)), params.in_buffer_size);
         WriteGPURegister(static_cast<u32>(GPU_REG_INDEX(display_transfer_config.output_size)), params.out_buffer_size);
diff --git a/src/core/hle/service/gsp_gpu.h b/src/core/hle/service/gsp_gpu.h
index 65abb194a..a435d418a 100644
--- a/src/core/hle/service/gsp_gpu.h
+++ b/src/core/hle/service/gsp_gpu.h
@@ -109,9 +109,13 @@ struct Command {
             u32 start1;
             u32 value1;
             u32 end1;
+
             u32 start2;
             u32 value2;
             u32 end2;
+
+            u16 control1;
+            u16 control2;
         } memory_fill;
 
         struct {
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index aad0e5d0d..bd7d92cd1 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -67,23 +67,38 @@ inline void Write(u32 addr, const T data) {
     switch (index) {
 
     // Memory fills are triggered once the fill value is written.
-    // NOTE: This is not verified.
-    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3):
-    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3):
+    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].trigger, 0x00004 + 0x3):
+    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].trigger, 0x00008 + 0x3):
     {
-        const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value));
-        const auto& config = g_regs.memory_fill_config[is_second_filler];
+        const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].trigger));
+        auto& config = g_regs.memory_fill_config[is_second_filler];
 
-        // TODO: Not sure if this check should be done at GSP level instead
-        if (config.address_start) {
-            // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
-            u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
-            u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
-            for (u32* ptr = start; ptr < end; ++ptr)
-                *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
+        if (config.address_start && config.trigger) {
+            u8* start = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
+            u8* end = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
+
+            if (config.fill_24bit) {
+                // fill with 24-bit values
+                for (u8* ptr = start; ptr < end; ptr += 3) {
+                    ptr[0] = config.value_24bit_b;
+                    ptr[1] = config.value_24bit_g;
+                    ptr[2] = config.value_24bit_r;
+                }
+            } else if (config.fill_32bit) {
+                // fill with 32-bit values
+                for (u32* ptr = (u32*)start; ptr < (u32*)end; ++ptr)
+                    *ptr = config.value_32bit;
+            } else {
+                // fill with 16-bit values
+                for (u16* ptr = (u16*)start; ptr < (u16*)end; ++ptr)
+                    *ptr = config.value_16bit;
+            }
 
             LOG_TRACE(HW_GPU, "MemoryFill from 0x%08x to 0x%08x", config.GetStartAddress(), config.GetEndAddress());
 
+            config.trigger = 0;
+            config.finished = 1;
+
             if (!is_second_filler) {
                 GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PSC0);
             } else {
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 9fd694f65..df9aa0d71 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -84,9 +84,35 @@ struct Regs {
 
     struct {
         u32 address_start;
-        u32 address_end; // ?
-        u32 size;
-        u32 value; // ?
+        u32 address_end;
+
+        union {
+            u32 value_32bit;
+
+            BitField<0, 16, u32> value_16bit;
+
+            // TODO: Verify component order
+            BitField< 0, 8, u32> value_24bit_r;
+            BitField< 8, 8, u32> value_24bit_g;
+            BitField<16, 8, u32> value_24bit_b;
+        };
+
+        union {
+            u32 control;
+
+            // Setting this field to 1 triggers the memory fill.
+            // This field also acts as a status flag, and gets reset to 0 upon completion.
+            BitField<0, 1, u32> trigger;
+
+            // Set to 1 upon completion.
+            BitField<0, 1, u32> finished;
+
+            // 0: fill with 16- or 32-bit wide values; 1: fill with 24-bit wide values
+            BitField<8, 1, u32> fill_24bit;
+
+            // 0: fill with 16-bit wide values; 1: fill with 32-bit wide values
+            BitField<9, 1, u32> fill_32bit;
+        };
 
         inline u32 GetStartAddress() const {
             return DecodeAddressRegister(address_start);

From 67120270f2c3250aca49d813278b342787e3cae0 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 20:40:52 +0100
Subject: [PATCH 02/18] Pica/CommandProcessor: Work around uninitialized vertex
 attributes some more.

---
 src/video_core/command_processor.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 0d9f4ba66..b2cc0f027 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <boost/range/algorithm/fill.hpp>
+
 #include "clipper.h"
 #include "command_processor.h"
 #include "math.h"
@@ -65,10 +67,14 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             // Information about internal vertex attributes
             u32 vertex_attribute_sources[16];
-            std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
+            boost::fill(vertex_attribute_sources, 0xdeadbeef);
             u32 vertex_attribute_strides[16];
             u32 vertex_attribute_formats[16];
-            u32 vertex_attribute_elements[16];
+
+            // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
+            // This is one of the hacks required to deal with uninitalized vertex attributes.
+            // TODO: Fix this properly.
+            u32 vertex_attribute_elements[16] = {};
             u32 vertex_attribute_element_size[16];
 
             // Setup attribute data from loaders

From 6c26ec72a5b299a5ceb3e4ca7ed0712d312da548 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 20:59:23 +0100
Subject: [PATCH 03/18] Pica/CommandProcessor: Properly implement shader load
 destination offset registers.

---
 src/video_core/command_processor.cpp | 22 ++++------------------
 src/video_core/pica.h                |  8 ++++++--
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index b2cc0f027..586ad62b6 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -25,10 +25,6 @@ static int float_regs_counter = 0;
 
 static u32 uniform_write_buffer[4];
 
-// Used for VSLoadProgramData and VSLoadSwizzleData
-static u32 vs_binary_write_offset = 0;
-static u32 vs_swizzle_write_offset = 0;
-
 static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     if (id >= registers.NumIds())
@@ -258,11 +254,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             break;
         }
 
-        // Seems to be used to reset the write pointer for VSLoadProgramData
-        case PICA_REG_INDEX(vs_program.begin_load):
-            vs_binary_write_offset = 0;
-            break;
-
         // Load shader program code
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
@@ -273,16 +264,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
         case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
         {
-            VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
-            vs_binary_write_offset++;
+            VertexShader::SubmitShaderMemoryChange(registers.vs_program.offset, value);
+            registers.vs_program.offset++;
             break;
         }
 
-        // Seems to be used to reset the write pointer for VSLoadSwizzleData
-        case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
-            vs_swizzle_write_offset = 0;
-            break;
-
         // Load swizzle pattern data
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
@@ -293,8 +279,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
         {
-            VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
-            vs_swizzle_write_offset++;
+            VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
+            registers.vs_swizzle_patterns.offset++;
             break;
         }
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 9c1a12dc8..cf9dc4853 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -678,7 +678,9 @@ struct Regs {
     INSERT_PADDING_WORDS(0x2);
 
     struct {
-        u32 begin_load;
+        // Offset of the next instruction to write code to.
+        // Incremented with each instruction write.
+        u32 offset;
 
         // Writing to these registers sets the "current" word in the shader program.
         // TODO: It's not clear how the hardware stores what the "current" word is.
@@ -690,7 +692,9 @@ struct Regs {
     // This register group is used to load an internal table of swizzling patterns,
     // which are indexed by each shader instruction to specify vector component swizzling.
     struct {
-        u32 begin_load;
+        // Offset of the next swizzle pattern to write code to.
+        // Incremented with each instruction write.
+        u32 offset;
 
         // Writing to these registers sets the "current" swizzle pattern in the table.
         // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.

From 70a764d992937b2919c6262c70c85117fe22d7d9 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 21 Dec 2014 03:01:35 +0100
Subject: [PATCH 04/18] Pica/VertexShader: Implement the LOOP instruction.

---
 src/video_core/vertex_shader.cpp | 50 +++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 80935a50a..def868ac7 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -85,8 +85,11 @@ struct VertexShaderState {
     };
 
     struct CallStackElement {
-        u32 final_address;
-        u32 return_address;
+        u32 final_address;  // Address upon which we jump to return_address
+        u32 return_address; // Where to jump when leaving scope
+        u8 repeat_counter;  // How often to repeat until this call stack element is removed
+        u8 loop_increment;  // Which value to add to the loop counter after an iteration
+                            // TODO: Should this be a signed value? Does it even matter?
     };
 
     // TODO: Is there a maximal size for this?
@@ -105,9 +108,14 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
     while (true) {
         if (!state.call_stack.empty()) {
-            if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
-                state.program_counter = &shader_memory[state.call_stack.top().return_address];
-                state.call_stack.pop();
+            auto& top = state.call_stack.top();
+            if (state.program_counter - shader_memory.data() == top.final_address) {
+                state.address_registers[2] += top.loop_increment;
+
+                if (top.repeat_counter-- == 0) {
+                    state.program_counter = &shader_memory[top.return_address];
+                    state.call_stack.pop();
+                }
 
                 // TODO: Is "trying again" accurate to hardware?
                 continue;
@@ -118,9 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
         const Instruction& instr = *(const Instruction*)state.program_counter;
         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
 
-        auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) {
+        static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
+                              u32 return_offset, u8 repeat_count, u8 loop_increment) {
             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            state.call_stack.push({ offset + num_instructions, return_offset });
+            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
         };
         u32 binary_offset = state.program_counter - shader_memory.data();
 
@@ -457,7 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
-                     binary_offset + 1);
+                     binary_offset + 1, 0, 0);
                 break;
 
             case Instruction::OpCode::CALLU:
@@ -465,7 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1);
+                        binary_offset + 1, 0, 0);
                 }
                 break;
 
@@ -474,7 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        binary_offset + 1);
+                        binary_offset + 1, 0, 0);
                 }
                 break;
 
@@ -486,12 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
@@ -504,17 +513,30 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
-                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
             }
 
+            case Instruction::OpCode::LOOP:
+            {
+                state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
+
+                call(state,
+                     binary_offset + 1,
+                     instr.flow_control.dest_offset - binary_offset + 1,
+                     instr.flow_control.dest_offset + 1,
+                     shader_uniforms.i[instr.flow_control.int_uniform_id].x,
+                     shader_uniforms.i[instr.flow_control.int_uniform_id].z);
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);

From 365236fa4c96eaba94b715b6844bff64238b70e5 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 20:37:25 +0100
Subject: [PATCH 05/18] Pica: Cleanup clipping code and change screenspace z to
 range from -1..0.

The new depth range seems to better match what applications expect, and makes for cleaner code overall (hence it is more likely to reflect hardware behavior).
---
 src/video_core/clipper.cpp    | 84 +++++++++++++++--------------------
 src/video_core/rasterizer.cpp | 11 ++---
 2 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 1744066ba..ba3876a76 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -15,30 +15,18 @@ namespace Clipper {
 
 struct ClippingEdge {
 public:
-    enum Type {
-        POS_X = 0,
-        NEG_X = 1,
-        POS_Y = 2,
-        NEG_Y = 3,
-        POS_Z = 4,
-        NEG_Z = 5,
-    };
-
-    ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
+    ClippingEdge(Math::Vec4<float24> coeffs,
+                 Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
+                                                                float24::FromFloat32(0),
+                                                                float24::FromFloat32(0),
+                                                                float24::FromFloat32(0)))
+        : coeffs(coeffs),
+          bias(bias)
+    {
+    }
 
     bool IsInside(const OutputVertex& vertex) const {
-        switch (type) {
-        case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
-        case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
-        case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
-        case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
-
-        // TODO: Check z compares ... should be 0..1 instead?
-        case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
-
-        default:
-        case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
-        }
+        return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
     }
 
     bool IsOutSide(const OutputVertex& vertex) const {
@@ -46,31 +34,17 @@ public:
     }
 
     OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
-        auto dotpr = [this](const OutputVertex& vtx) {
-            switch (type) {
-            case POS_X: return vtx.pos.x - vtx.pos.w;
-            case NEG_X: return -vtx.pos.x - vtx.pos.w;
-            case POS_Y: return vtx.pos.y - vtx.pos.w;
-            case NEG_Y: return -vtx.pos.y - vtx.pos.w;
-
-            // TODO: Verify z clipping
-            case POS_Z: return vtx.pos.z - vtx.pos.w;
-
-            default:
-            case NEG_Z: return -vtx.pos.w;
-            }
-        };
-
-        float24 dp = dotpr(v0);
-        float24 dp_prev = dotpr(v1);
+        float24 dp = Math::Dot(v0.pos + bias, coeffs);
+        float24 dp_prev = Math::Dot(v1.pos + bias, coeffs);
         float24 factor = dp_prev / (dp_prev - dp);
 
         return OutputVertex::Lerp(factor, v0, v1);
     }
 
 private:
-    Type type;
     float24 pos;
+    Math::Vec4<float24> coeffs;
+    Math::Vec4<float24> bias;
 };
 
 static void InitScreenCoordinates(OutputVertex& vtx)
@@ -98,10 +72,9 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     vtx.tc2 *= inv_w;
     vtx.pos.w = inv_w;
 
-    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
@@ -117,14 +90,29 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     auto* output_list = &buffer_a;
     auto* input_list  = &buffer_b;
 
+    // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
+    // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
+    //       epsilon possible within float24 accuracy.
+    static const float24 EPSILON = float24::FromFloat32(0.00001);
+    static const float24 f0 = float24::FromFloat32(0.0);
+    static const float24 f1 = float24::FromFloat32(1.0);
+    static const std::array<ClippingEdge, 7> clipping_edges = {{
+        { Math::MakeVec( f1,  f0,  f0, -f1) },  // x = +w
+        { Math::MakeVec(-f1,  f0,  f0, -f1) },  // x = -w
+        { Math::MakeVec( f0,  f1,  f0, -f1) },  // y = +w
+        { Math::MakeVec( f0, -f1,  f0, -f1) },  // y = -w
+        { Math::MakeVec( f0,  f0,  f1,  f0) },  // z =  0
+        { Math::MakeVec( f0,  f0, -f1, -f1) },  // z = -w
+        { Math::MakeVec( f0,  f0,  f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
+    }};
+
+    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
+    //       drop the whole primitive instead of clipping the primitive properly. We should test if
+    //       this happens on the 3DS, too.
+
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
-    for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
-                       ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
-                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
-                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
+    for (auto edge : clipping_edges) {
 
         std::swap(input_list, output_list);
         output_list->clear();
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 3faa10153..046c010ef 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -106,16 +106,17 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
-    if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
-        // Reverse vertex order and use the CCW code path.
+    if (registers.cull_mode == Regs::CullMode::KeepCounterClockWise) {
+        // Reverse vertex order and use the CW code path.
         std::swap(vtxpos[1], vtxpos[2]);
     }
 
     if (registers.cull_mode != Regs::CullMode::KeepAll) {
-        // Cull away triangles which are wound clockwise.
-        // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
+        // Cull away triangles which are wound counter-clockwise.
         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
             return;
+    } else {
+        // TODO: Consider A check for degenerate triangles ("SignedArea == 0")
     }
 
     // TODO: Proper scissor rect test!
@@ -475,7 +476,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
 
             // TODO: Does depth indeed only get written even if depth testing is enabled?
             if (registers.output_merger.depth_test_enable) {
-                u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
+                u16 z = (u16)((v0.screenpos[2].ToFloat32() * w0 +
                             v1.screenpos[2].ToFloat32() * w1 +
                             v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
                 u16 ref_z = GetDepth(x >> 4, y >> 4);

From 638b370fb5a9dff1296e6c60c02ac68911ae666a Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 18 Feb 2015 13:14:49 +0100
Subject: [PATCH 06/18] Pica/Rasterizer: Clean up and fix backface culling.

---
 src/video_core/rasterizer.cpp | 38 +++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 046c010ef..5769bd81e 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -90,9 +90,14 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
     return Math::Cross(vec1, vec2).z;
 };
 
-void ProcessTriangle(const VertexShader::OutputVertex& v0,
-                     const VertexShader::OutputVertex& v1,
-                     const VertexShader::OutputVertex& v2)
+/**
+ * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
+ * culling via recursion.
+ */
+static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
+                                    const VertexShader::OutputVertex& v1,
+                                    const VertexShader::OutputVertex& v2,
+                                    bool reversed = false)
 {
     // vertex positions in rasterizer coordinates
     auto FloatToFix = [](float24 flt) {
@@ -106,17 +111,22 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
-    if (registers.cull_mode == Regs::CullMode::KeepCounterClockWise) {
-        // Reverse vertex order and use the CW code path.
-        std::swap(vtxpos[1], vtxpos[2]);
-    }
+    if (registers.cull_mode == Regs::CullMode::KeepAll) {
+        // Make sure we always end up with a triangle wound counter-clockwise
+        if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
+            ProcessTriangleInternal(v0, v2, v1, true);
+            return;
+        }
+    } else {
+        if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
+            // Reverse vertex order and use the CCW code path.
+            ProcessTriangleInternal(v0, v2, v1, true);
+            return;
+        }
 
-    if (registers.cull_mode != Regs::CullMode::KeepAll) {
-        // Cull away triangles which are wound counter-clockwise.
+        // Cull away triangles which are wound clockwise.
         if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
             return;
-    } else {
-        // TODO: Consider A check for degenerate triangles ("SignedArea == 0")
     }
 
     // TODO: Proper scissor rect test!
@@ -695,6 +705,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     }
 }
 
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2) {
+    ProcessTriangleInternal(v0, v1, v2);
+}
+
 } // namespace Rasterizer
 
 } // namespace Pica

From 3cb22d31a71ce1f0103b075cfc2533aaebff9ace Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 01:01:06 +0100
Subject: [PATCH 07/18] Pica/Rasterizer: Fix garbage pixels at triangle
 borders.

---
 src/video_core/rasterizer.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 5769bd81e..74182abef 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -101,7 +101,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 {
     // vertex positions in rasterizer coordinates
     auto FloatToFix = [](float24 flt) {
-                          return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
+                          // TODO: Rounding here is necessary to prevent garbage pixels at
+                          //       triangle borders. Is it that the correct solution, though?
+                          return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
                       };
     auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
                                              return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};

From 3b5710bae662dbd20b54736362c6f72888b9009e Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 01:02:18 +0100
Subject: [PATCH 08/18] Pica/Rasterizer: Rasterize actual pixel centers instead
 of pixel corners.

---
 src/video_core/rasterizer.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 74182abef..168a2ada0 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -168,9 +168,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
     auto textures = registers.GetTextures();
     auto tev_stages = registers.GetTevStages();
 
+    // Enter rasterization loop, starting at the center of the top-left pixel of the bounding box.
     // TODO: Not sure if looping through x first might be faster
-    for (u16 y = min_y; y < max_y; y += 0x10) {
-        for (u16 x = min_x; x < max_x; x += 0x10) {
+    for (u16 y = min_y + 8; y < max_y; y += 0x10) {
+        for (u16 x = min_x + 8; x < max_x; x += 0x10) {
 
             // Calculate the barycentric coordinates w0, w1 and w2
             int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});

From aaf30ca4ee7cb539722a2928a578a579641987a1 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Fri, 2 Jan 2015 15:26:50 +0100
Subject: [PATCH 09/18] Pica/OutputMerger: Implement color format checking.

---
 src/video_core/pica.h         |  2 +-
 src/video_core/rasterizer.cpp | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index cf9dc4853..effa61571 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -421,7 +421,7 @@ struct Regs {
         INSERT_PADDING_WORDS(0x6);
 
         u32 depth_format;
-        u32 color_format;
+        BitField<16, 3, u32> color_format;
 
         INSERT_PADDING_WORDS(0x4);
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 168a2ada0..27eeb531d 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -20,10 +20,19 @@ namespace Rasterizer {
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
     u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
-    u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
 
-    // Assuming RGBA8 format until actual framebuffer format handling is implemented
-    *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+    switch (registers.framebuffer.color_format) {
+    case registers.framebuffer.RGBA8:
+    {
+        u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
+        *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+        break;
+    }
+
+    default:
+        LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
+        exit(1);
+    }
 }
 
 static const Math::Vec4<u8> GetPixel(int x, int y) {

From 8bd7a896ea82b5b2f4ff7f2ec50624ff6ec45431 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Thu, 1 Jan 2015 19:59:11 +0100
Subject: [PATCH 10/18] Pica: Fix a bug in the register definitions, relating
 to texture wrapping.

---
 src/video_core/pica.h         | 2 +-
 src/video_core/rasterizer.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index effa61571..ef9809d57 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -131,7 +131,7 @@ struct Regs {
 
         union {
             BitField< 8, 2, WrapMode> wrap_s;
-            BitField<11, 2, WrapMode> wrap_t;
+            BitField<12, 2, WrapMode> wrap_t;
         };
 
         INSERT_PADDING_WORDS(0x1);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 27eeb531d..65dddb47c 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -243,7 +243,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
                 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
-                auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
+                static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                     switch (mode) {
                         case Regs::TextureConfig::ClampToEdge:
                             val = std::max(val, 0);

From 6ca752ccbc7c59dab66f476ca02d3b53527c57da Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 3 Jan 2015 13:33:57 +0100
Subject: [PATCH 11/18] Pica/TextureUnit: Implement mirrored repeating texture
 wrapping.

---
 src/video_core/pica.h         |  5 +++--
 src/video_core/rasterizer.cpp | 10 +++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index ef9809d57..c20bf99d4 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -118,8 +118,9 @@ struct Regs {
 
     struct TextureConfig {
         enum WrapMode : u32 {
-            ClampToEdge = 0,
-            Repeat      = 2,
+            ClampToEdge    = 0,
+            Repeat         = 2,
+            MirroredRepeat = 3,
         };
 
         INSERT_PADDING_WORDS(0x1);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 65dddb47c..f788122d8 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -251,7 +251,15 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                             return val;
 
                         case Regs::TextureConfig::Repeat:
-                            return (int)(((unsigned)val) % size);
+                            return (int)((unsigned)val % size);
+
+                        case Regs::TextureConfig::MirroredRepeat:
+                        {
+                            // Distinct local (re-declaring "val" would shadow the parameter); fold into [0, 2*size) and mirror the upper half.
+                            unsigned int coord = (unsigned int)val % (2 * size);
+                            if (coord >= size) coord = 2 * size - 1 - coord;
+                            return (int)coord;
+                        }
 
                         default:
                             LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);

From 087edcfbec86ba730d55c4fdbbf65097a8cfb8e4 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 3 Jan 2015 13:37:05 +0100
Subject: [PATCH 12/18] Pica/OutputMerger: Fix flipped framebuffers.

---
 src/video_core/rasterizer.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index f788122d8..9cad5f9b6 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -21,6 +21,10 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
     u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
+    // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
+    // NOTE: The framebuffer height register contains the actual FB height minus one.
+    y = (registers.framebuffer.height - y);
+
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
@@ -39,6 +43,8 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
     u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
+    y = (registers.framebuffer.height - y);
+
     u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
     Math::Vec4<u8> ret;
     ret.a() = value >> 24;
@@ -52,6 +58,8 @@ static u32 GetDepth(int x, int y) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
+    y = (registers.framebuffer.height - y);
+
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 }
@@ -60,6 +68,8 @@ static void SetDepth(int x, int y, u16 value) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
+    y = (registers.framebuffer.height - y);
+
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 }

From 04cd06d5c285848c29278083891474ee78797c8a Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 3 Jan 2015 13:45:10 +0100
Subject: [PATCH 13/18] Pica/TextureEnvironment: Add support for the MAD-like
 texture combiners and clean up texture environment logic.

---
 src/video_core/pica.h         |  3 +++
 src/video_core/rasterizer.cpp | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index c20bf99d4..23fc6b9ba 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -266,6 +266,9 @@ struct Regs {
             AddSigned       = 3,
             Lerp            = 4,
             Subtract        = 5,
+
+            MultiplyThenAdd = 8,
+            AddThenMultiply = 9,
         };
 
         union {
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 9cad5f9b6..eacca82e5 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -419,6 +419,25 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                         return result.Cast<u8>();
                     }
 
+                    case Operation::MultiplyThenAdd:
+                    {
+                        auto result = (input[0] * input[1] + 255 * input[2].Cast<int>()) / 255;
+                        result.r() = std::min(255, result.r());
+                        result.g() = std::min(255, result.g());
+                        result.b() = std::min(255, result.b());
+                        return result.Cast<u8>();
+                    }
+
+                    case Operation::AddThenMultiply:
+                    {
+                        auto result = input[0] + input[1];
+                        result.r() = std::min(255, result.r());
+                        result.g() = std::min(255, result.g());
+                        result.b() = std::min(255, result.b());
+                        result = (result * input[2].Cast<int>()) / 255;
+                        return result.Cast<u8>();
+                    }
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();
@@ -443,6 +462,12 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     case Operation::Subtract:
                         return std::max(0, (int)input[0] - (int)input[1]);
 
+                    case Operation::MultiplyThenAdd:
+                        return std::min(255, (input[0] * input[1] + 255 * input[2]) / 255);
+
+                    case Operation::AddThenMultiply:
+                        return (std::min(255, (input[0] + input[1])) * input[2]) / 255;
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
                         UNIMPLEMENTED();

From e11fb96408b27e2aa76e29a380fe3a2d15d37d32 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 3 Jan 2015 13:49:53 +0100
Subject: [PATCH 14/18] Pica/TextureEnvironment: Treat texture combiner source
 1 as the PrimaryColor.

Not really sure what the difference between the two is, but some applications seem to use source 1 in exactly the same way as PrimaryColor...
---
 src/video_core/pica.h         | 2 ++
 src/video_core/rasterizer.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 23fc6b9ba..24f2c2382 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -224,6 +224,8 @@ struct Regs {
     struct TevStageConfig {
         enum class Source : u32 {
             PrimaryColor           = 0x0,
+            PrimaryFragmentColor   = 0x1,
+
             Texture0               = 0x3,
             Texture1               = 0x4,
             Texture2               = 0x5,
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index eacca82e5..4bf7593ce 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -303,7 +303,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
+                    // TODO: What's the difference between these two?
                     case Source::PrimaryColor:
+                    case Source::PrimaryFragmentColor:
                         return primary_color;
 
                     case Source::Texture0:

From 81ebb4d682a1c2f452d96bd5545251a8fd856def Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 3 Jan 2015 13:51:51 +0100
Subject: [PATCH 15/18] Pica/TextureEnvironment: Add a note.

---
 src/video_core/rasterizer.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 4bf7593ce..b7a7e62ab 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -277,6 +277,10 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                             return 0;
                     }
                 };
+
+                // Textures are laid out from bottom to top, hence we invert the t coordinate.
+                // NOTE: This may not be the right place for the inversion.
+                // TODO: Check if this applies to ETC textures, too.
                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 

From 156120434251d0ae726442206f8b5b41338d700d Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 11 Feb 2015 21:39:43 +0100
Subject: [PATCH 16/18] Pica/BlendUnit: Implement separate color/alpha blend
 equations.

---
 src/video_core/pica.h         |   2 +-
 src/video_core/rasterizer.cpp | 122 ++++++++++++++++------------------
 2 files changed, 59 insertions(+), 65 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 24f2c2382..e4a5ef78e 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -343,7 +343,7 @@ struct Regs {
         };
 
         union {
-            enum BlendEquation : u32 {
+            enum class BlendEquation : u32 {
                 Add             = 0,
                 Subtract        = 1,
                 ReverseSubtract = 2,
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index b7a7e62ab..f96015de4 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -5,6 +5,7 @@
 #include <algorithm>
 
 #include "common/common_types.h"
+#include "common/math_util.h"
 
 #include "math.h"
 #include "pica.h"
@@ -596,6 +597,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
             }
 
             auto dest = GetPixel(x >> 4, y >> 4);
+            Math::Vec4<u8> blend_output = combiner_output;
 
             if (registers.output_merger.alphablend_enable) {
                 auto params = registers.output_merger.alpha_blending;
@@ -684,81 +686,73 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                     }
                 };
 
+                using BlendEquation = decltype(params)::BlendEquation;
+                static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
+                                                       const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
+                                                       BlendEquation equation) {
+                    Math::Vec4<int> result;
+
+                    auto src_result = (src  *  srcfactor).Cast<int>();
+                    auto dst_result = (dest * destfactor).Cast<int>();
+
+                    switch (equation) {
+                    case BlendEquation::Add:
+                        result = (src_result + dst_result) / 255;
+                        break;
+
+                    case BlendEquation::Subtract:
+                        result = (src_result - dst_result) / 255;
+                        break;
+
+                    case BlendEquation::ReverseSubtract:
+                        result = (dst_result - src_result) / 255;
+                        break;
+
+                    // TODO: How do these two actually work?
+                    //       OpenGL doesn't include the blend factors in the min/max computations,
+                    //       but is this what the 3DS actually does?
+                    case BlendEquation::Min:
+                        result.r() = std::min(src.r(), dest.r());
+                        result.g() = std::min(src.g(), dest.g());
+                        result.b() = std::min(src.b(), dest.b());
+                        result.a() = std::min(src.a(), dest.a());
+                        break;
+
+                    case BlendEquation::Max:
+                        result.r() = std::max(src.r(), dest.r());
+                        result.g() = std::max(src.g(), dest.g());
+                        result.b() = std::max(src.b(), dest.b());
+                        result.a() = std::max(src.a(), dest.a());
+                        break;
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
+                        exit(0);
+                    }
+
+                    return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
+                                    MathUtil::Clamp(result.g(), 0, 255),
+                                    MathUtil::Clamp(result.b(), 0, 255),
+                                    MathUtil::Clamp(result.a(), 0, 255));
+                };
+
                 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
                                                LookupFactorA(params.factor_source_a));
                 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
                                                LookupFactorA(params.factor_dest_a));
-                                               
-                auto src_result = (combiner_output * srcfactor).Cast<int>();
-                auto dst_result = (dest * dstfactor).Cast<int>();
 
-                switch (params.blend_equation_rgb) {
-                case params.Add:
-                {
-                    auto result = (src_result + dst_result) / 255;
-                    result.r() = std::min(255, result.r());
-                    result.g() = std::min(255, result.g());
-                    result.b() = std::min(255, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Subtract:
-                {
-                    auto result = (src_result - dst_result) / 255;
-                    result.r() = std::max(0, result.r());
-                    result.g() = std::max(0, result.g());
-                    result.b() = std::max(0, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.ReverseSubtract:
-                {
-                    auto result = (dst_result - src_result) / 255;
-                    result.r() = std::max(0, result.r());
-                    result.g() = std::max(0, result.g());
-                    result.b() = std::max(0, result.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Min:
-                {
-                    // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
-                    Math::Vec4<int> result;
-                    result.r() = std::min(combiner_output.r(),dest.r());
-                    result.g() = std::min(combiner_output.g(),dest.g());
-                    result.b() = std::min(combiner_output.b(),dest.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-                
-                case params.Max:
-                {
-                    // TODO: GL spec says to do it without the factors, but is this what the 3DS does?
-                    Math::Vec4<int> result;
-                    result.r() = std::max(combiner_output.r(),dest.r());
-                    result.g() = std::max(combiner_output.g(),dest.g());
-                    result.b() = std::max(combiner_output.b(),dest.b());
-                    combiner_output = result.Cast<u8>();
-                    break;
-                }
-
-                default:
-                    LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
-                    exit(0);
-                }
+                blend_output     = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
+                blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
             } else {
                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
                 exit(0);
             }
 
             const Math::Vec4<u8> result = {
-                registers.output_merger.red_enable   ? combiner_output.r() : dest.r(),
-                registers.output_merger.green_enable ? combiner_output.g() : dest.g(),
-                registers.output_merger.blue_enable  ? combiner_output.b() : dest.b(),
-                registers.output_merger.alpha_enable ? combiner_output.a() : dest.a()
+                registers.output_merger.red_enable   ? blend_output.r() : dest.r(),
+                registers.output_merger.green_enable ? blend_output.g() : dest.g(),
+                registers.output_merger.blue_enable  ? blend_output.b() : dest.b(),
+                registers.output_merger.alpha_enable ? blend_output.a() : dest.a()
             };
 
             DrawPixel(x >> 4, y >> 4, result);

From 6e5a903286a50b84e417d1bfda7ef07108354ae3 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 18 Feb 2015 12:14:40 +0100
Subject: [PATCH 17/18] Pica/Rasterizer: Make some local lambdas static.

---
 src/video_core/rasterizer.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index f96015de4..a3712f116 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -120,14 +120,14 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                                     bool reversed = false)
 {
     // vertex positions in rasterizer coordinates
-    auto FloatToFix = [](float24 flt) {
-                          // TODO: Rounding here is necessary to prevent garbage pixels at
-                          //       triangle borders. Is it that the correct solution, though?
-                          return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
-                      };
-    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
-                                             return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
-                                         };
+    static auto FloatToFix = [](float24 flt) {
+        // TODO: Rounding here is necessary to prevent garbage pixels at
+        //       triangle borders. Is that the correct solution, though?
+        return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
+    };
+    static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
+        return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+    };
 
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),

From 2eee3a87f915a8d18482cdd15e6bfd823467ea13 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 18 Feb 2015 13:50:25 +0100
Subject: [PATCH 18/18] Pica/Rasterizer: Replace exit() calls with
 UNIMPLEMENTED().

---
 src/video_core/rasterizer.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index a3712f116..94873f406 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -36,7 +36,7 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
 
     default:
         LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
-        exit(1);
+        UNIMPLEMENTED();
     }
 }
 
@@ -648,7 +648,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                     default:
                         LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
-                        exit(0);
+                        UNIMPLEMENTED();
                         break;
                     }
                 };
@@ -681,7 +681,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                     default:
                         LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
-                        exit(0);
+                        UNIMPLEMENTED();
                         break;
                     }
                 };
@@ -727,7 +727,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 
                     default:
                         LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", equation);
-                        exit(0);
+                        UNIMPLEMENTED();
                     }
 
                     return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
@@ -745,7 +745,7 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
                 blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
             } else {
                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
-                exit(0);
+                UNIMPLEMENTED();
             }
 
             const Math::Vec4<u8> result = {