From bd798390d5c3d1bf9084abe9839597162608f575 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Wed, 23 Jul 2014 14:42:15 +0200
Subject: [PATCH 01/11] GSP: Fix a major regression introduced in ffda035c, due
 to which no display transfers were triggered at all anymore.

---
 src/core/hle/service/gsp.cpp | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/core/hle/service/gsp.cpp b/src/core/hle/service/gsp.cpp
index 08e65612e..e2b0a48a7 100644
--- a/src/core/hle/service/gsp.cpp
+++ b/src/core/hle/service/gsp.cpp
@@ -32,7 +32,7 @@ static inline u8* GetCommandBuffer(u32 thread_id) {
     if (0 == g_shared_memory)
         return nullptr;
 
-    return Kernel::GetSharedMemoryPointer(g_shared_memory, 
+    return Kernel::GetSharedMemoryPointer(g_shared_memory,
         0x800 + (thread_id * sizeof(CommandBuffer)));
 }
 
@@ -205,8 +205,16 @@ void ExecuteCommand(const Command& command) {
         break;
     }
 
-    // TODO: Check if texture copies are implemented correctly..
     case CommandId::SET_DISPLAY_TRANSFER:
+    {
+        auto& params = command.image_copy;
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), params.in_buffer_address >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), params.out_buffer_address >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1);
+
         // TODO(bunnei): Signalling all of these interrupts here is totally wrong, but it seems to
         // work well enough for running demos. Need to figure out how these all work and trigger
         // them correctly.
@@ -216,7 +224,9 @@ void ExecuteCommand(const Command& command) {
         SignalInterrupt(InterruptId::P3D);
         SignalInterrupt(InterruptId::DMA);
         break;
+    }
 
+    // TODO: Check if texture copies are implemented correctly..
     case CommandId::SET_TEXTURE_COPY:
     {
         auto& params = command.image_copy;
@@ -226,8 +236,7 @@ void ExecuteCommand(const Command& command) {
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
 
-        // TODO: Should this only be ORed with 1 for texture copies?
-        // trigger transfer
+        // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1?
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1);
         break;
     }

From 7b6a7d7dfb92d7a6d3537ea8b0339c2170d7eb84 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 3 Aug 2014 01:46:47 +0200
Subject: [PATCH 02/11] Pica/GPU: Change hardware registers to use physical
 addresses rather than virtual ones.

This cleans up the mess that address reading/writing had become and makes the code a *lot* more sensible.
This adds a physical<->virtual address converter to mem_map.h. For further accuracy, we will want to properly extend this to support a wider range of address regions. For now, though, this is enough to make simple homebrew applications work well.
---
 src/core/hle/service/gsp.cpp                  |  18 ++--
 src/core/hw/gpu.cpp                           | 102 ++++--------------
 src/core/hw/gpu.h                             |  66 ------------
 src/core/mem_map.cpp                          |   4 +-
 src/core/mem_map.h                            |  22 ++--
 src/core/mem_map_funcs.cpp                    |  70 ++++++------
 src/video_core/pica.h                         |   2 +-
 .../renderer_opengl/renderer_opengl.cpp       |  14 +--
 8 files changed, 81 insertions(+), 217 deletions(-)

diff --git a/src/core/hle/service/gsp.cpp b/src/core/hle/service/gsp.cpp
index e2b0a48a7..635f50a53 100644
--- a/src/core/hle/service/gsp.cpp
+++ b/src/core/hle/service/gsp.cpp
@@ -173,7 +173,7 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_COMMAND_LIST_LAST:
     {
         auto& params = command.set_command_list_last;
-        WriteGPURegister(GPU_REG_INDEX(command_processor_config.address), params.address >> 3);
+        WriteGPURegister(GPU_REG_INDEX(command_processor_config.address), Memory::VirtualToPhysicalAddress(params.address) >> 3);
         WriteGPURegister(GPU_REG_INDEX(command_processor_config.size), params.size >> 3);
 
         // TODO: Not sure if we are supposed to always write this .. seems to trigger processing though
@@ -193,13 +193,13 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_MEMORY_FILL:
     {
         auto& params = command.memory_fill;
-        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_start), params.start1 >> 3);
-        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_end), params.end1 >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_start), Memory::VirtualToPhysicalAddress(params.start1) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_end), Memory::VirtualToPhysicalAddress(params.end1) >> 3);
         WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].size), params.end1 - params.start1);
         WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].value), params.value1);
 
-        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_start), params.start2 >> 3);
-        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_end), params.end2 >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_start), Memory::VirtualToPhysicalAddress(params.start2) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_end), Memory::VirtualToPhysicalAddress(params.end2) >> 3);
         WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].size), params.end2 - params.start2);
         WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].value), params.value2);
         break;
@@ -208,8 +208,8 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_DISPLAY_TRANSFER:
     {
         auto& params = command.image_copy;
-        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), params.in_buffer_address >> 3);
-        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), params.out_buffer_address >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
@@ -230,8 +230,8 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_TEXTURE_COPY:
     {
         auto& params = command.image_copy;
-        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), params.in_buffer_address >> 3);
-        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), params.out_buffer_address >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
         WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index fd40f8ac0..591997aa3 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -24,83 +24,6 @@ Regs g_regs;
 u32 g_cur_line = 0;         ///< Current vertical screen line
 u64 g_last_line_ticks = 0;  ///< CPU tick count from last vertical screen line
 
-/**
- * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM
- * @param
- */
-void SetFramebufferLocation(const FramebufferLocation mode) {
-    switch (mode) {
-    case FRAMEBUFFER_LOCATION_FCRAM:
-    {
-        auto& framebuffer_top = g_regs.framebuffer_config[0];
-        auto& framebuffer_sub = g_regs.framebuffer_config[1];
-
-        framebuffer_top.address_left1  = PADDR_TOP_LEFT_FRAME1;
-        framebuffer_top.address_left2  = PADDR_TOP_LEFT_FRAME2;
-        framebuffer_top.address_right1 = PADDR_TOP_RIGHT_FRAME1;
-        framebuffer_top.address_right2 = PADDR_TOP_RIGHT_FRAME2;
-        framebuffer_sub.address_left1  = PADDR_SUB_FRAME1;
-        //framebuffer_sub.address_left2  = unknown;
-        framebuffer_sub.address_right1 = PADDR_SUB_FRAME2;
-        //framebuffer_sub.address_right2 = unknown;
-        break;
-    }
-
-    case FRAMEBUFFER_LOCATION_VRAM:
-    {
-        auto& framebuffer_top = g_regs.framebuffer_config[0];
-        auto& framebuffer_sub = g_regs.framebuffer_config[1];
-
-        framebuffer_top.address_left1  = PADDR_VRAM_TOP_LEFT_FRAME1;
-        framebuffer_top.address_left2  = PADDR_VRAM_TOP_LEFT_FRAME2;
-        framebuffer_top.address_right1 = PADDR_VRAM_TOP_RIGHT_FRAME1;
-        framebuffer_top.address_right2 = PADDR_VRAM_TOP_RIGHT_FRAME2;
-        framebuffer_sub.address_left1  = PADDR_VRAM_SUB_FRAME1;
-        //framebuffer_sub.address_left2  = unknown;
-        framebuffer_sub.address_right1 = PADDR_VRAM_SUB_FRAME2;
-        //framebuffer_sub.address_right2 = unknown;
-        break;
-    }
-    }
-}
-
-/**
- * Gets the location of the framebuffers
- * @return Location of framebuffers as FramebufferLocation enum
- */
-FramebufferLocation GetFramebufferLocation(u32 address) {
-    if ((address & ~Memory::VRAM_MASK) == Memory::VRAM_PADDR) {
-        return FRAMEBUFFER_LOCATION_VRAM;
-    } else if ((address & ~Memory::FCRAM_MASK) == Memory::FCRAM_PADDR) {
-        return FRAMEBUFFER_LOCATION_FCRAM;
-    } else {
-        ERROR_LOG(GPU, "unknown framebuffer location!");
-    }
-    return FRAMEBUFFER_LOCATION_UNKNOWN;
-}
-
-u32 GetFramebufferAddr(const u32 address) {
-    switch (GetFramebufferLocation(address)) {
-    case FRAMEBUFFER_LOCATION_FCRAM:
-        return Memory::VirtualAddressFromPhysical_FCRAM(address);
-    case FRAMEBUFFER_LOCATION_VRAM:
-        return Memory::VirtualAddressFromPhysical_VRAM(address);
-    default:
-        ERROR_LOG(GPU, "unknown framebuffer location");
-    }
-    return 0;
-}
-
-/**
- * Gets a read-only pointer to a framebuffer in memory
- * @param address Physical address of framebuffer
- * @return Returns const pointer to raw framebuffer
- */
-const u8* GetFramebufferPointer(const u32 address) {
-    u32 addr = GetFramebufferAddr(address);
-    return (addr != 0) ? Memory::GetPointer(addr) : nullptr;
-}
-
 template <typename T>
 inline void Read(T &var, const u32 raw_addr) {
     u32 addr = raw_addr - 0x1EF00000;
@@ -141,8 +64,8 @@ inline void Write(u32 addr, const T data) {
         // TODO: Not sure if this check should be done at GSP level instead
         if (config.address_start) {
             // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
-            u32* start = (u32*)Memory::GetPointer(config.GetStartAddress());
-            u32* end = (u32*)Memory::GetPointer(config.GetEndAddress());
+            u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
+            u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
             for (u32* ptr = start; ptr < end; ++ptr)
                 *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
 
@@ -155,8 +78,8 @@ inline void Write(u32 addr, const T data) {
     {
         const auto& config = g_regs.display_transfer_config;
         if (config.trigger & 1) {
-            u8* source_pointer = Memory::GetPointer(config.GetPhysicalInputAddress());
-            u8* dest_pointer = Memory::GetPointer(config.GetPhysicalOutputAddress());
+            u8* source_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalInputAddress()));
+            u8* dest_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalOutputAddress()));
 
             for (int y = 0; y < config.output_height; ++y) {
                 // TODO: Why does the register seem to hold twice the framebuffer width?
@@ -276,11 +199,22 @@ void Init() {
     g_cur_line = 0;
     g_last_line_ticks = Core::g_app_core->GetTicks();
 
-//    SetFramebufferLocation(FRAMEBUFFER_LOCATION_FCRAM);
-    SetFramebufferLocation(FRAMEBUFFER_LOCATION_VRAM);
-
     auto& framebuffer_top = g_regs.framebuffer_config[0];
     auto& framebuffer_sub = g_regs.framebuffer_config[1];
+
+    // Setup default framebuffer addresses (located in VRAM)
+    // .. or at least these are the ones used by system applets.
+    // There's probably a smarter way to come up with addresses
+    // like this which does not require hardcoding.
+    framebuffer_top.address_left1  = 0x181E6000;
+    framebuffer_top.address_left2  = 0x1822C800;
+    framebuffer_top.address_right1 = 0x18273000;
+    framebuffer_top.address_right2 = 0x182B9800;
+    framebuffer_sub.address_left1  = 0x1848F000;
+    //framebuffer_sub.address_left2  = unknown;
+    framebuffer_sub.address_right1 = 0x184C7800;
+    //framebuffer_sub.address_right2 = unknown;
+
     // TODO: Width should be 240 instead?
     framebuffer_top.width = 480;
     framebuffer_top.height = 400;
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 3065da891..d20311a00 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -249,72 +249,6 @@ static_assert(sizeof(Regs) == 0x1000 * sizeof(u32), "Invalid total size of regis
 
 extern Regs g_regs;
 
-enum {
-    TOP_ASPECT_X        = 0x5,
-    TOP_ASPECT_Y        = 0x3,
-
-    TOP_HEIGHT          = 240,
-    TOP_WIDTH           = 400,
-    BOTTOM_WIDTH        = 320,
-
-    // Physical addresses in FCRAM (chosen arbitrarily)
-    PADDR_TOP_LEFT_FRAME1       = 0x201D4C00,
-    PADDR_TOP_LEFT_FRAME2       = 0x202D4C00,
-    PADDR_TOP_RIGHT_FRAME1      = 0x203D4C00,
-    PADDR_TOP_RIGHT_FRAME2      = 0x204D4C00,
-    PADDR_SUB_FRAME1            = 0x205D4C00,
-    PADDR_SUB_FRAME2            = 0x206D4C00,
-    // Physical addresses in FCRAM used by ARM9 applications
-/*    PADDR_TOP_LEFT_FRAME1       = 0x20184E60,
-    PADDR_TOP_LEFT_FRAME2       = 0x201CB370,
-    PADDR_TOP_RIGHT_FRAME1      = 0x20282160,
-    PADDR_TOP_RIGHT_FRAME2      = 0x202C8670,
-    PADDR_SUB_FRAME1            = 0x202118E0,
-    PADDR_SUB_FRAME2            = 0x20249CF0,*/
-
-    // Physical addresses in VRAM
-    // TODO: These should just be deduced from the ones above
-    PADDR_VRAM_TOP_LEFT_FRAME1  = 0x181D4C00,
-    PADDR_VRAM_TOP_LEFT_FRAME2  = 0x182D4C00,
-    PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x183D4C00,
-    PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x184D4C00,
-    PADDR_VRAM_SUB_FRAME1       = 0x185D4C00,
-    PADDR_VRAM_SUB_FRAME2       = 0x186D4C00,
-    // Physical addresses in VRAM used by ARM9 applications
-/*    PADDR_VRAM_TOP_LEFT_FRAME2  = 0x181CB370,
-    PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x18282160,
-    PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x182C8670,
-    PADDR_VRAM_SUB_FRAME1       = 0x182118E0,
-    PADDR_VRAM_SUB_FRAME2       = 0x18249CF0,*/
-};
-
-/// Framebuffer location
-enum FramebufferLocation {
-    FRAMEBUFFER_LOCATION_UNKNOWN,   ///< Framebuffer location is unknown
-    FRAMEBUFFER_LOCATION_FCRAM,     ///< Framebuffer is in the GSP heap
-    FRAMEBUFFER_LOCATION_VRAM,      ///< Framebuffer is in VRAM
-};
-
-/**
- * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM
- * @param
- */
-void SetFramebufferLocation(const FramebufferLocation mode);
-
-/**
- * Gets a read-only pointer to a framebuffer in memory
- * @param address Physical address of framebuffer
- * @return Returns const pointer to raw framebuffer
- */
-const u8* GetFramebufferPointer(const u32 address);
-
-u32 GetFramebufferAddr(const u32 address);
-
-/**
- * Gets the location of the framebuffers
- */
-FramebufferLocation GetFramebufferLocation(u32 address);
-
 template <typename T>
 void Read(T &var, const u32 addr);
 
diff --git a/src/core/mem_map.cpp b/src/core/mem_map.cpp
index c45746be9..14fc01471 100644
--- a/src/core/mem_map.cpp
+++ b/src/core/mem_map.cpp
@@ -72,14 +72,14 @@ void Init() {
 
     g_base = MemoryMap_Setup(g_views, kNumMemViews, flags, &g_arena);
 
-    NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap, 
+    NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap,
         g_physical_fcram);
 }
 
 void Shutdown() {
     u32 flags = 0;
     MemoryMap_Shutdown(g_views, kNumMemViews, flags, &g_arena);
-    
+
     g_arena.ReleaseSpace();
     g_base = NULL;
 
diff --git a/src/core/mem_map.h b/src/core/mem_map.h
index 12941f558..3c7810573 100644
--- a/src/core/mem_map.h
+++ b/src/core/mem_map.h
@@ -14,7 +14,6 @@ namespace Memory {
 enum {
     BOOTROM_SIZE            = 0x00010000,   ///< Bootrom (super secret code/data @ 0x8000) size
     MPCORE_PRIV_SIZE        = 0x00002000,   ///< MPCore private memory region size
-    VRAM_SIZE               = 0x00600000,   ///< VRAM size
     DSP_SIZE                = 0x00080000,   ///< DSP memory size
     AXI_WRAM_SIZE           = 0x00080000,   ///< AXI WRAM size
 
@@ -23,8 +22,6 @@ enum {
     FCRAM_PADDR_END         = (FCRAM_PADDR + FCRAM_SIZE),       ///< FCRAM end of physical space
     FCRAM_VADDR             = 0x08000000,                       ///< FCRAM virtual address
     FCRAM_VADDR_END         = (FCRAM_VADDR + FCRAM_SIZE),       ///< FCRAM end of virtual space
-    FCRAM_VADDR_FW0B        = 0xF0000000,                       ///< FCRAM adress for firmare FW0B
-    FCRAM_VADDR_FW0B_END    = (FCRAM_VADDR_FW0B + FCRAM_SIZE),  ///< FCRAM adress end for FW0B
     FCRAM_MASK              = (FCRAM_SIZE - 1),                 ///< FCRAM mask
 
     SHARED_MEMORY_SIZE      = 0x04000000,   ///< Shared memory size
@@ -73,6 +70,7 @@ enum {
     HARDWARE_IO_PADDR_END   = (HARDWARE_IO_PADDR + HARDWARE_IO_SIZE),
     HARDWARE_IO_VADDR_END   = (HARDWARE_IO_VADDR + HARDWARE_IO_SIZE),
 
+    VRAM_SIZE               = 0x00600000,
     VRAM_PADDR              = 0x18000000,
     VRAM_VADDR              = 0x1F000000,
     VRAM_PADDR_END          = (VRAM_PADDR + VRAM_SIZE),
@@ -112,7 +110,7 @@ struct MemoryBlock {
 
 // In 64-bit, this might point to "high memory" (above the 32-bit limit),
 // so be sure to load it into a 64-bit register.
-extern u8 *g_base; 
+extern u8 *g_base;
 
 // These are guaranteed to point to "low memory" addresses (sub-32-bit).
 // 64-bit: Pointers to low-mem (sub-0x10000000) mirror
@@ -147,7 +145,7 @@ void Write32(const u32 addr, const u32 data);
 
 void WriteBlock(const u32 addr, const u8* data, const int size);
 
-u8* GetPointer(const u32 Address);
+u8* GetPointer(const u32 virtual_address);
 
 /**
  * Maps a block of memory on the heap
@@ -169,16 +167,10 @@ inline const char* GetCharPointer(const u32 address) {
     return (const char *)GetPointer(address);
 }
 
-inline const u32 VirtualAddressFromPhysical_FCRAM(const u32 address) {
-    return ((address & FCRAM_MASK) | FCRAM_VADDR);
-}
+/// Converts a physical address to virtual address
+u32 PhysicalToVirtualAddress(const u32 addr);
 
-inline const u32 VirtualAddressFromPhysical_IO(const u32 address) {
-    return (address + 0x0EB00000);
-}
-
-inline const u32 VirtualAddressFromPhysical_VRAM(const u32 address) {
-    return (address + 0x07000000);
-}
+/// Converts a virtual address to physical address
+u32 VirtualToPhysicalAddress(const u32 addr);
 
 } // namespace
diff --git a/src/core/mem_map_funcs.cpp b/src/core/mem_map_funcs.cpp
index 305be8468..5772cca52 100644
--- a/src/core/mem_map_funcs.cpp
+++ b/src/core/mem_map_funcs.cpp
@@ -17,37 +17,44 @@ std::map<u32, MemoryBlock> g_heap_map;
 std::map<u32, MemoryBlock> g_heap_gsp_map;
 std::map<u32, MemoryBlock> g_shared_map;
 
-/// Convert a physical address (or firmware-specific virtual address) to primary virtual address
-u32 _VirtualAddress(const u32 addr) {
-    // Our memory interface read/write functions assume virtual addresses. Put any physical address 
-    // to virtual address translations here. This is obviously quite hacky... But we're not doing 
-    // any MMU emulation yet or anything
-    if ((addr >= FCRAM_PADDR) && (addr < FCRAM_PADDR_END)) {
-        return VirtualAddressFromPhysical_FCRAM(addr);
-
-    // Virtual address mapping FW0B
-    } else if ((addr >= FCRAM_VADDR_FW0B) && (addr < FCRAM_VADDR_FW0B_END)) {
-        return VirtualAddressFromPhysical_FCRAM(addr);
-
-    // Hardware IO
-    // TODO(bunnei): FixMe
-    // This isn't going to work... The physical address of HARDWARE_IO conflicts with the virtual 
-    // address of shared memory.
-    //} else if ((addr >= HARDWARE_IO_PADDR) && (addr < HARDWARE_IO_PADDR_END)) {
-    //    return (addr + 0x0EB00000);
-
+/// Convert a physical address to virtual address (VRAM and FCRAM regions only)
+u32 PhysicalToVirtualAddress(const u32 addr) {
+    // Our memory interface read/write functions assume virtual addresses. Put any physical address
+    // to virtual address translations here. This is quite hacky, but necessary until we implement
+    // proper MMU emulation.
+    // TODO: Screw it, I'll let bunnei figure out how to do this properly.
+    if ((addr >= VRAM_PADDR) && (addr < VRAM_PADDR_END)) {
+        return addr - VRAM_PADDR + VRAM_VADDR; // keep the offset, swap physical base for virtual
+    }else if ((addr >= FCRAM_PADDR) && (addr < FCRAM_PADDR_END)) {
+        return addr - FCRAM_PADDR + FCRAM_VADDR; // keep the offset, swap physical base for virtual
     }
+
+    ERROR_LOG(MEMMAP, "Unknown physical address @ 0x%08x", addr); // outside both known regions
+    return addr; // fall back to returning the address untranslated
+}
+
+/// Convert a virtual address to physical address (VRAM and FCRAM regions only)
+u32 VirtualToPhysicalAddress(const u32 addr) {
+    // Our memory interface read/write functions assume virtual addresses. Put any virtual address
+    // to physical address translations here. This is quite hacky, but necessary until we implement
+    // proper MMU emulation.
+    // TODO: Screw it, I'll let bunnei figure out how to do this properly.
+    if ((addr >= VRAM_VADDR) && (addr < VRAM_VADDR_END)) {
+        return addr - 0x07000000; // 0x07000000 == VRAM_VADDR - VRAM_PADDR (see mem_map.h)
+    } else if ((addr >= FCRAM_VADDR) && (addr < FCRAM_VADDR_END)) {
+        return addr - FCRAM_VADDR + FCRAM_PADDR; // keep the offset, swap virtual base for physical
+    }
+
+    ERROR_LOG(MEMMAP, "Unknown virtual address @ 0x%08x", addr); // outside both known regions
     return addr;
 }
 
 template <typename T>
-inline void Read(T &var, const u32 addr) {
+inline void Read(T &var, const u32 vaddr) {
     // TODO: Figure out the fastest order of tests for both read and write (they are probably different).
     // TODO: Make sure this represents the mirrors in a correct way.
     // Could just do a base-relative read, too.... TODO
 
-    const u32 vaddr = _VirtualAddress(addr);
-
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         var = *((const T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK]);
@@ -91,9 +98,8 @@ inline void Read(T &var, const u32 addr) {
 }
 
 template <typename T>
-inline void Write(u32 addr, const T data) {
-    u32 vaddr = _VirtualAddress(addr);
-    
+inline void Write(u32 vaddr, const T data) {
+
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         *(T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK] = data;
@@ -133,16 +139,14 @@ inline void Write(u32 addr, const T data) {
     //    _assert_msg_(MEMMAP, false, "umimplemented write to Configuration Memory");
     //} else if ((vaddr & 0xFFFFF000) == 0x1FF81000) {
     //    _assert_msg_(MEMMAP, false, "umimplemented write to shared page");
-    
+
     // Error out...
     } else {
         ERROR_LOG(MEMMAP, "unknown Write%d 0x%08X @ 0x%08X", sizeof(data) * 8, data, vaddr);
     }
 }
 
-u8 *GetPointer(const u32 addr) {
-    const u32 vaddr = _VirtualAddress(addr);
-
+u8 *GetPointer(const u32 vaddr) {
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         return g_kernel_mem + (vaddr & KERNEL_MEMORY_MASK);
@@ -185,12 +189,12 @@ u8 *GetPointer(const u32 addr) {
  */
 u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) {
     MemoryBlock block;
-    
+
     block.base_address  = HEAP_VADDR;
     block.size          = size;
     block.operation     = operation;
     block.permissions   = permissions;
-    
+
     if (g_heap_map.size() > 0) {
         const MemoryBlock last_block = g_heap_map.rbegin()->second;
         block.address = last_block.address + last_block.size;
@@ -208,12 +212,12 @@ u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) {
  */
 u32 MapBlock_HeapGSP(u32 size, u32 operation, u32 permissions) {
     MemoryBlock block;
-    
+
     block.base_address  = HEAP_GSP_VADDR;
     block.size          = size;
     block.operation     = operation;
     block.permissions   = permissions;
-    
+
     if (g_heap_gsp_map.size() > 0) {
         const MemoryBlock last_block = g_heap_gsp_map.rbegin()->second;
         block.address = last_block.address + last_block.size;
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index d64559d72..858335d44 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -45,7 +45,7 @@ struct Regs {
     INSERT_PADDING_WORDS(0x41);
 
     BitField<0, 24, u32> viewport_size_x;
-    INSERT_PADDING_WORDS(1);
+    INSERT_PADDING_WORDS(0x1);
     BitField<0, 24, u32> viewport_size_y;
 
     INSERT_PADDING_WORDS(0x1bc);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 02b174562..f11a64fad 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -81,20 +81,20 @@ void RendererOpenGL::RenderXFB(const common::Rect& src_rect, const common::Rect&
     const auto& framebuffer_top = GPU::g_regs.framebuffer_config[0];
     const auto& framebuffer_sub = GPU::g_regs.framebuffer_config[1];
     const u32 active_fb_top = (framebuffer_top.active_fb == 1)
-                                ? framebuffer_top.address_left2
-                                : framebuffer_top.address_left1;
+                                ? Memory::PhysicalToVirtualAddress(framebuffer_top.address_left2)
+                                : Memory::PhysicalToVirtualAddress(framebuffer_top.address_left1);
     const u32 active_fb_sub = (framebuffer_sub.active_fb == 1)
-                                ? framebuffer_sub.address_left2
-                                : framebuffer_sub.address_left1;
+                                ? Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left2)
+                                : Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left1);
 
     DEBUG_LOG(GPU, "RenderXFB: 0x%08x bytes from 0x%08x(%dx%d), fmt %x",
               framebuffer_top.stride * framebuffer_top.height,
-              GPU::GetFramebufferAddr(active_fb_top), (int)framebuffer_top.width,
+              active_fb_top, (int)framebuffer_top.width,
               (int)framebuffer_top.height, (int)framebuffer_top.format);
 
     // TODO: This should consider the GPU registers for framebuffer width, height and stride.
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_top), m_xfb_top_flipped);
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_sub), m_xfb_bottom_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_top), m_xfb_top_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_sub), m_xfb_bottom_flipped);
 
     // Blit the top framebuffer
     // ------------------------

From 8528c810cf6e2fcaad89c47ff2b598529a5bff64 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 27 Jul 2014 20:07:45 +0200
Subject: [PATCH 03/11] Video core: Add utility class for vector operations.

I wrote most of this for ppsspp, so I hold full copyright over it.
In addition to the original release in ppsspp, this provides functionality to easily extend e.g. two-dimensional vectors to three-dimensional vectors.
---
 src/video_core/CMakeLists.txt             |   3 +-
 src/video_core/math.h                     | 578 ++++++++++++++++++++++
 src/video_core/video_core.vcxproj         |   1 +
 src/video_core/video_core.vcxproj.filters |   1 +
 4 files changed, 582 insertions(+), 1 deletion(-)
 create mode 100644 src/video_core/math.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e43e6e1bb..2503b9d18 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -2,8 +2,9 @@ set(SRCS    video_core.cpp
             utils.cpp
             renderer_opengl/renderer_opengl.cpp)
 
-set(HEADERS video_core.h
+set(HEADERS math.h
             utils.h
+            video_core.h
             renderer_base.h
             renderer_opengl/renderer_opengl.h)
 
diff --git a/src/video_core/math.h b/src/video_core/math.h
new file mode 100644
index 000000000..7030f2cfb
--- /dev/null
+++ b/src/video_core/math.h
@@ -0,0 +1,578 @@
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+
+// Copyright 2014 Tony Wasserka
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the owner nor the names of its contributors may
+//       be used to endorse or promote products derived from this software
+//       without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <cmath>
+
+namespace Math {
+
+template<typename T> class Vec2;
+template<typename T> class Vec3;
+template<typename T> class Vec4;
+
+
+template<typename T>
+class Vec2 {
+public:
+    struct {
+        T x,y;
+    };
+
+    T* AsArray() { return &x; }
+
+    Vec2() = default;
+    Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
+    Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}
+
+    template<typename T2>
+    Vec2<T2> Cast() const {
+        return Vec2<T2>((T2)x, (T2)y);
+    }
+
+    static Vec2 AssignToAll(const T& f)
+    {
+        return Vec2<T>(f, f);
+    }
+
+    void Write(T a[2])
+    {
+        a[0] = x; a[1] = y;
+    }
+
+    Vec2 operator +(const Vec2& other) const
+    {
+        return Vec2(x+other.x, y+other.y);
+    }
+    void operator += (const Vec2 &other)
+    {
+        x+=other.x; y+=other.y;
+    }
+    Vec2 operator -(const Vec2& other) const
+    {
+        return Vec2(x-other.x, y-other.y);
+    }
+    void operator -= (const Vec2& other)
+    {
+        x-=other.x; y-=other.y;
+    }
+    Vec2 operator -() const
+    {
+        return Vec2(-x,-y);
+    }
+    Vec2 operator * (const Vec2& other) const
+    {
+        return Vec2(x*other.x, y*other.y);
+    }
+    template<typename V>
+    Vec2 operator * (const V& f) const
+    {
+        return Vec2(x*f,y*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f;
+    }
+    template<typename V>
+    Vec2 operator / (const V& f) const
+    {
+        return Vec2(x/f,y/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const
+    {
+        return x*x + y*y;
+    }
+
+    // Only implemented for T=float
+    float Length() const;
+    void SetLength(const float l);
+    Vec2 WithLength(const float l) const;
+    float Distance2To(Vec2 &other);
+    Vec2 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[1] = 3   (vector.y=3)
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero()
+    {
+        x=0; y=0;
+    }
+
+    // Common aliases: UV (texel coordinates), ST (texture coordinates)
+    T& u() { return x; }
+    T& v() { return y; }
+    T& s() { return x; }
+    T& t() { return y; }
+
+    const T& u() const { return x; }
+    const T& v() const { return y; }
+    const T& s() const { return x; }
+    const T& t() const { return y; }
+
+    // swizzlers - create a subvector of specific components
+    Vec2 yx() const { return Vec2(y, x); }
+    Vec2 vu() const { return Vec2(y, x); }
+    Vec2 ts() const { return Vec2(y, x); }
+
+    // Inserters: return a new Vec3 made of this Vec2's components plus one extra value; *this is not modified
+    Vec3<T> InsertBeforeX(const T& value) const {
+        return Vec3<T>(value, x, y);
+    }
+    Vec3<T> InsertBeforeY(const T& value) const {
+        return Vec3<T>(x, value, y);
+    }
+    Vec3<T> Append(const T& value) const {
+        return Vec3<T>(x, y, value);
+    }
+};
+
+template<typename T, typename V>
+Vec2<T> operator * (const V& f, const Vec2<T>& vec)
+{
+    return Vec2<T>(f*vec.x,f*vec.y);
+}
+
+typedef Vec2<float> Vec2f;
+
+template<typename T>
+class Vec3
+{
+public:
+    struct
+    {
+        T x,y,z;
+    };
+
+    T* AsArray() { return &x; }
+
+    Vec3() = default;
+    Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
+    Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
+
+    template<typename T2>
+    Vec3<T2> Cast() const {
+        return Vec3<T2>((T2)x, (T2)y, (T2)z);
+    }
+
+    // Only implemented for T=int and T=float
+    static Vec3 FromRGB(unsigned int rgb);
+    unsigned int ToRGB() const; // alpha bits set to zero
+
+    static Vec3 AssignToAll(const T& f)
+    {
+        return Vec3<T>(f, f, f);
+    }
+
+    void Write(T a[3])
+    {
+        a[0] = x; a[1] = y; a[2] = z;
+    }
+
+    Vec3 operator +(const Vec3 &other) const
+    {
+        return Vec3(x+other.x, y+other.y, z+other.z);
+    }
+    void operator += (const Vec3 &other)
+    {
+        x+=other.x; y+=other.y; z+=other.z;
+    }
+    Vec3 operator -(const Vec3 &other) const
+    {
+        return Vec3(x-other.x, y-other.y, z-other.z);
+    }
+    void operator -= (const Vec3 &other)
+    {
+        x-=other.x; y-=other.y; z-=other.z;
+    }
+    Vec3 operator -() const
+    {
+        return Vec3(-x,-y,-z);
+    }
+    Vec3 operator * (const Vec3 &other) const
+    {
+        return Vec3(x*other.x, y*other.y, z*other.z);
+    }
+    template<typename V>
+    Vec3 operator * (const V& f) const
+    {
+        return Vec3(x*f,y*f,z*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f; z*=f;
+    }
+    template<typename V>
+    Vec3 operator / (const V& f) const
+    {
+        return Vec3(x/f,y/f,z/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const
+    {
+        return x*x + y*y + z*z;
+    }
+
+    // Only implemented for T=float
+    float Length() const;
+    void SetLength(const float l);
+    Vec3 WithLength(const float l) const;
+    float Distance2To(Vec3 &other);
+    Vec3 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero()
+    {
+        x=0; y=0; z=0;
+    }
+
+    // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
+    T& u() { return x; }
+    T& v() { return y; }
+    T& w() { return z; }
+
+    T& r() { return x; }
+    T& g() { return y; }
+    T& b() { return z; }
+
+    T& s() { return x; }
+    T& t() { return y; }
+    T& q() { return z; }
+
+    const T& u() const { return x; }
+    const T& v() const { return y; }
+    const T& w() const { return z; }
+
+    const T& r() const { return x; }
+    const T& g() const { return y; }
+    const T& b() const { return z; }
+
+    const T& s() const { return x; }
+    const T& t() const { return y; }
+    const T& q() const { return z; }
+
+    // swizzlers - create a subvector of specific components
+    // e.g. Vec2 uv() { return Vec2(x,y); }
+    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
+#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
+    _DEFINE_SWIZZLER2(a, b, a##b); \
+    _DEFINE_SWIZZLER2(a, b, a2##b2); \
+    _DEFINE_SWIZZLER2(a, b, a3##b3); \
+    _DEFINE_SWIZZLER2(a, b, a4##b4); \
+    _DEFINE_SWIZZLER2(b, a, b##a); \
+    _DEFINE_SWIZZLER2(b, a, b2##a2); \
+    _DEFINE_SWIZZLER2(b, a, b3##a3); \
+    _DEFINE_SWIZZLER2(b, a, b4##a4);
+
+    DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
+    DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
+    DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
+#undef DEFINE_SWIZZLER2
+#undef _DEFINE_SWIZZLER2
+
+    // Inserters: return a new Vec4 made of this Vec3's components plus one extra value; *this is not modified
+    Vec4<T> InsertBeforeX(const T& value) const {
+        return Vec4<T>(value, x, y, z);
+    }
+    Vec4<T> InsertBeforeY(const T& value) const {
+        return Vec4<T>(x, value, y, z);
+    }
+    Vec4<T> InsertBeforeZ(const T& value) const {
+        return Vec4<T>(x, y, value, z);
+    }
+    Vec4<T> Append(const T& value) const {
+        return Vec4<T>(x, y, z, value);
+    }
+};
+
+template<typename T, typename V>
+Vec3<T> operator * (const V& f, const Vec3<T>& vec)
+{
+    return Vec3<T>(f*vec.x,f*vec.y,f*vec.z);
+}
+
+typedef Vec3<float> Vec3f;
+
+template<typename T>
+class Vec4
+{
+public:
+    struct
+    {
+        T x,y,z,w;
+    };
+
+    T* AsArray() { return &x; }
+
+    Vec4() = default;
+    Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
+    Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
+
+    template<typename T2>
+    Vec4<T2> Cast() const {
+        return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
+    }
+
+    // Only implemented for T=int and T=float
+    static Vec4 FromRGBA(unsigned int rgba);
+    unsigned int ToRGBA() const;
+
+    static Vec4 AssignToAll(const T& f) {
+        return Vec4<T>(f, f, f, f);
+    }
+
+    void Write(T a[4])
+    {
+        a[0] = x; a[1] = y; a[2] = z; a[3] = w;
+    }
+
+    Vec4 operator +(const Vec4& other) const
+    {
+        return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
+    }
+    void operator += (const Vec4& other)
+    {
+        x+=other.x; y+=other.y; z+=other.z; w+=other.w;
+    }
+    Vec4 operator -(const Vec4 &other) const
+    {
+        return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
+    }
+    void operator -= (const Vec4 &other)
+    {
+        x-=other.x; y-=other.y; z-=other.z; w-=other.w;
+    }
+    Vec4 operator -() const
+    {
+        return Vec4(-x,-y,-z,-w);
+    }
+    Vec4 operator * (const Vec4 &other) const
+    {
+        return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
+    }
+    template<typename V>
+    Vec4 operator * (const V& f) const
+    {
+        return Vec4(x*f,y*f,z*f,w*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f; z*=f; w*=f;
+    }
+    template<typename V>
+    Vec4 operator / (const V& f) const
+    {
+        return Vec4(x/f,y/f,z/f,w/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const
+    {
+        return x*x + y*y + z*z + w*w;
+    }
+
+    // Only implemented for T=float
+    float Length() const;
+    void SetLength(const float l);
+    Vec4 WithLength(const float l) const;
+    float Distance2To(Vec4 &other);
+    Vec4 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero() // resets all four components (x, y, z and w) to zero
+    {
+        x=0; y=0; z=0; w=0;
+    }
+
+    // Common alias: RGBA (colors)
+    T& r() { return x; }
+    T& g() { return y; }
+    T& b() { return z; }
+    T& a() { return w; }
+
+    const T& r() const { return x; }
+    const T& g() const { return y; }
+    const T& b() const { return z; }
+    const T& a() const { return w; }
+
+    // swizzlers - create a subvector of specific components
+    // e.g. Vec2 uv() { return Vec2(x,y); }
+    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
+#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define DEFINE_SWIZZLER2(a, b, a2, b2) \
+    _DEFINE_SWIZZLER2(a, b, a##b); \
+    _DEFINE_SWIZZLER2(a, b, a2##b2); \
+    _DEFINE_SWIZZLER2(b, a, b##a); \
+    _DEFINE_SWIZZLER2(b, a, b2##a2);
+
+    DEFINE_SWIZZLER2(x, y, r, g);
+    DEFINE_SWIZZLER2(x, z, r, b);
+    DEFINE_SWIZZLER2(x, w, r, a);
+    DEFINE_SWIZZLER2(y, z, g, b);
+    DEFINE_SWIZZLER2(y, w, g, a);
+    DEFINE_SWIZZLER2(z, w, b, a);
+#undef DEFINE_SWIZZLER2
+#undef _DEFINE_SWIZZLER2
+
+#define _DEFINE_SWIZZLER3(a, b, c, name) Vec3<T> name() const { return Vec3<T>(a, b, c); }
+#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
+    _DEFINE_SWIZZLER3(a, b, c, a##b##c); \
+    _DEFINE_SWIZZLER3(a, c, b, a##c##b); \
+    _DEFINE_SWIZZLER3(b, a, c, b##a##c); \
+    _DEFINE_SWIZZLER3(b, c, a, b##c##a); \
+    _DEFINE_SWIZZLER3(c, a, b, c##a##b); \
+    _DEFINE_SWIZZLER3(c, b, a, c##b##a); \
+    _DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
+    _DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
+    _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
+    _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
+    _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
+    _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
+
+    DEFINE_SWIZZLER3(x, y, z, r, g, b);
+    DEFINE_SWIZZLER3(x, y, w, r, g, a);
+    DEFINE_SWIZZLER3(x, z, w, r, b, a);
+    DEFINE_SWIZZLER3(y, z, w, g, b, a);
+#undef DEFINE_SWIZZLER3
+#undef _DEFINE_SWIZZLER3
+};
+
+
+template<typename T, typename V>
+Vec4<T> operator * (const V& f, const Vec4<T>& vec)
+{
+    return Vec4<T>(f*vec.x,f*vec.y,f*vec.z,f*vec.w);
+}
+
+typedef Vec4<float> Vec4f;
+
+
+template<typename T>
+static inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
+{
+    return a.x*b.x + a.y*b.y;
+}
+
+template<typename T>
+static inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+template<typename T>
+static inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+}
+
+template<typename T>
+static inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
+{
+    return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+// linear interpolation via float: 0.0=begin, 1.0=end
+template<typename X>
+static inline X Lerp(const X& begin, const X& end, const float t)
+{
+    return begin*(1.f-t) + end*t;
+}
+
+// linear interpolation via int: 0=begin, base=end
+template<typename X, int base>
+static inline X LerpInt(const X& begin, const X& end, const int t)
+{
+    return (begin*(base-t) + end*t) / base;
+}
+
+// Utility vector factories
+template<typename T>
+static inline Vec2<T> MakeVec2(const T& x, const T& y)
+{
+    return Vec2<T>{x, y};
+}
+
+template<typename T>
+static inline Vec3<T> MakeVec3(const T& x, const T& y, const T& z)
+{
+    return Vec3<T>{x, y, z};
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec4(const T& x, const T& y, const T& z, const T& w)
+{
+    return Vec4<T>{x, y, z, w};
+}
+
+} // namespace
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index d77be2bef..2dbfc68dd 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -25,6 +25,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="gpu_debugger.h" />
+    <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index b89ac1ac4..b42823d2a 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -17,6 +17,7 @@
       <Filter>renderer_opengl</Filter>
     </ClInclude>
     <ClInclude Include="gpu_debugger.h" />
+    <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />

From 98ad16a45b9441a54d80e67425ac3ddee24f08dc Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 27 Jul 2014 20:08:42 +0200
Subject: [PATCH 04/11] Pica: Add float24 structure.

24-bit floating points are used internally for calculations on the GPU, however the current code will still emulate that with 32-bit floating points.
In the future we might want to perform the calculations accurately with the correct bit width, but for now we just wrap the calculations in this class.
---
 src/video_core/pica.h | 75 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 858335d44..24b39a3ad 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -161,6 +161,81 @@ ASSERT_REG_POSITION(vertex_descriptor, 0x200);
 // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
 static_assert(sizeof(Regs) == 0x300 * sizeof(u32), "Invalid total size of register set");
 
+
+struct float24 {
+    static float24 FromFloat32(float val) {
+        float24 ret;
+        ret.value = val;
+        return ret;
+    }
+
+    // Decodes a raw value: 16 bit mantissa (bits 0-15), 7 bit exponent (bits 16-22, biased by 63), 1 bit sign (bit 23); bits 24-31 are ignored
+    // TODO: No idea if this works as intended
+    static float24 FromRawFloat24(u32 hex) {
+        float24 ret;
+        if ((hex & 0xFFFFFF) == 0) {
+            ret.value = 0;
+        } else {
+            u32 mantissa = hex & 0xFFFF;
+            u32 exponent = (hex >> 16) & 0x7F;
+            u32 sign = (hex >> 23) & 1; // masked like the other fields so stray bits above bit 23 cannot flip the sign
+            ret.value = powf(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * powf(2.0f, -16.f));
+            if (sign)
+                ret.value = -ret.value;
+        }
+        return ret;
+    }
+
+    // Not recommended for anything but logging
+    float ToFloat32() const {
+        return value;
+    }
+
+    float24 operator * (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
+    }
+
+    float24 operator / (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() / flt.ToFloat32());
+    }
+
+    float24 operator + (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() + flt.ToFloat32());
+    }
+
+    float24 operator - (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
+    }
+
+    float24 operator - () const {
+        return float24::FromFloat32(-ToFloat32());
+    }
+
+    bool operator < (const float24& flt) const {
+        return ToFloat32() < flt.ToFloat32();
+    }
+
+    bool operator > (const float24& flt) const {
+        return ToFloat32() > flt.ToFloat32();
+    }
+
+    bool operator >= (const float24& flt) const {
+        return ToFloat32() >= flt.ToFloat32();
+    }
+
+    bool operator <= (const float24& flt) const {
+        return ToFloat32() <= flt.ToFloat32();
+    }
+
+private:
+    float24() = default;
+
+    // Stored as a regular float, merely for convenience
+    // TODO: Perform proper arithmetic on this!
+    float value;
+};
+
+
 union CommandHeader {
     CommandHeader(u32 h) : hex(h) {}
 

From 76a586de4952df6d8dd9db9d97716c00690cebdd Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 26 Jul 2014 14:42:46 +0200
Subject: [PATCH 05/11] Pica: Add command processor.

---
 src/citra_qt/debugger/graphics_cmdlists.cpp |  2 +-
 src/core/hw/gpu.cpp                         |  8 +--
 src/video_core/CMakeLists.txt               |  7 ++-
 src/video_core/command_processor.cpp        | 60 +++++++++++++++++++++
 src/video_core/command_processor.h          | 31 +++++++++++
 src/video_core/gpu_debugger.h               |  8 +--
 src/video_core/pica.h                       |  2 +
 src/video_core/video_core.vcxproj           |  2 +
 src/video_core/video_core.vcxproj.filters   |  2 +
 9 files changed, 113 insertions(+), 9 deletions(-)
 create mode 100644 src/video_core/command_processor.cpp
 create mode 100644 src/video_core/command_processor.h

diff --git a/src/citra_qt/debugger/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics_cmdlists.cpp
index 30b8b5dae..e98560a19 100644
--- a/src/citra_qt/debugger/graphics_cmdlists.cpp
+++ b/src/citra_qt/debugger/graphics_cmdlists.cpp
@@ -78,7 +78,7 @@ QVariant GPUCommandListModel::data(const QModelIndex& index, int role) const
         // index refers to a specific command
         const GraphicsDebugger::PicaCommandList& cmdlist = command_lists[item->parent->index].second;
         const GraphicsDebugger::PicaCommand& cmd = cmdlist[item->index];
-        const Pica::CommandHeader& header = cmd.GetHeader();
+        const Pica::CommandProcessor::CommandHeader& header = cmd.GetHeader();
 
         if (role == Qt::DisplayRole) {
             QString content;
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 591997aa3..87cf93bac 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -14,6 +14,7 @@
 
 #include "core/hw/gpu.h"
 
+#include "video_core/command_processor.h"
 #include "video_core/video_core.h"
 
 
@@ -143,14 +144,15 @@ inline void Write(u32 addr, const T data) {
         break;
     }
 
+    // Seems like writing to this register triggers processing
     case GPU_REG_INDEX(command_processor_config.trigger):
     {
         const auto& config = g_regs.command_processor_config;
         if (config.trigger & 1)
         {
-            // u32* buffer = (u32*)Memory::GetPointer(config.GetPhysicalAddress());
-            ERROR_LOG(GPU, "Beginning 0x%08x bytes of commands from address 0x%08x", config.size, config.GetPhysicalAddress());
-            // TODO: Process command list!
+            u32* buffer = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalAddress()));
+            u32 size = config.size << 3;
+            Pica::CommandProcessor::ProcessCommandList(buffer, size);
         }
         break;
     }
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2503b9d18..8977c8dca 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,11 +1,14 @@
-set(SRCS    video_core.cpp
+set(SRCS    command_processor.cpp
             utils.cpp
+            video_core.cpp
             renderer_opengl/renderer_opengl.cpp)
 
-set(HEADERS math.h
+set(HEADERS command_processor.h
+            math.h
             utils.h
             video_core.h
             renderer_base.h
+            video_core.h
             renderer_opengl/renderer_opengl.h)
 
 add_library(video_core STATIC ${SRCS} ${HEADERS})
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
new file mode 100644
index 000000000..515c407ea
--- /dev/null
+++ b/src/video_core/command_processor.cpp
@@ -0,0 +1,60 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "pica.h"
+#include "command_processor.h"
+
+
+namespace Pica {
+
+Regs registers;
+
+namespace CommandProcessor {
+
+static inline void WritePicaReg(u32 id, u32 value) {
+    u32 old_value = registers[id];
+    registers[id] = value;
+
+    switch(id) {
+        // TODO: Perform actions for anything which requires special treatment here...
+
+        default:
+            break;
+    }
+}
+
+static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
+    const CommandHeader& header = *(const CommandHeader*)(&first_command_word[1]);
+
+    u32* read_pointer = (u32*)first_command_word;
+
+    // TODO: Take parameter mask into consideration!
+
+    WritePicaReg(header.cmd_id, *read_pointer);
+    read_pointer += 2;
+
+    for (int i = 1; i < 1+header.extra_data_length; ++i) {
+        u32 cmd = header.cmd_id + ((header.group_commands) ? i : 0);
+        WritePicaReg(cmd, *read_pointer);
+        ++read_pointer;
+    }
+
+    // align read pointer to 8 bytes
+    if ((first_command_word - read_pointer) % 2)
+        ++read_pointer;
+
+    return read_pointer - first_command_word;
+}
+
+void ProcessCommandList(const u32* list, u32 size) { // executes every command block in list; size is a byte count (the GPU register handler passes config.size << 3)
+    u32* read_pointer = (u32*)list;
+
+    while (read_pointer < list + size / sizeof(u32)) { // list is a word pointer, so the byte count must be converted to words
+        read_pointer += ExecuteCommandBlock(read_pointer); // ExecuteCommandBlock returns the number of words it consumed
+    }
+}
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
new file mode 100644
index 000000000..6b6241a25
--- /dev/null
+++ b/src/video_core/command_processor.h
@@ -0,0 +1,31 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+#include "pica.h"
+
+namespace Pica {
+
+namespace CommandProcessor {
+
+union CommandHeader {
+    u32 hex;
+
+    BitField< 0, 16, u32> cmd_id;
+    BitField<16,  4, u32> parameter_mask;
+    BitField<20, 11, u32> extra_data_length;
+    BitField<31,  1, u32> group_commands;
+};
+static_assert(std::is_standard_layout<CommandHeader>::value == true, "CommandHeader does not use standard layout");
+static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
+
+void ProcessCommandList(const u32* list, u32 size);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
index 5d85f90b9..2ba873457 100644
--- a/src/video_core/gpu_debugger.h
+++ b/src/video_core/gpu_debugger.h
@@ -11,6 +11,8 @@
 #include "common/log.h"
 
 #include "core/hle/service/gsp.h"
+
+#include "command_processor.h"
 #include "pica.h"
 
 class GraphicsDebugger
@@ -20,10 +22,10 @@ public:
     // A vector of commands represented by their raw byte sequence
     struct PicaCommand : public std::vector<u32>
     {
-        const Pica::CommandHeader& GetHeader() const
+        const Pica::CommandProcessor::CommandHeader& GetHeader() const
         {
             const u32& val = at(1);
-            return *(Pica::CommandHeader*)&val;
+            return *(Pica::CommandProcessor::CommandHeader*)&val;
         }
     };
 
@@ -99,7 +101,7 @@ public:
         PicaCommandList cmdlist;
         for (u32* parse_pointer = command_list; parse_pointer < command_list + size_in_words;)
         {
-            const Pica::CommandHeader header = static_cast<Pica::CommandHeader>(parse_pointer[1]);
+            const Pica::CommandProcessor::CommandHeader& header = *(Pica::CommandProcessor::CommandHeader*)(&parse_pointer[1]);
 
             cmdlist.push_back(PicaCommand());
             auto& cmd = cmdlist.back();
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 24b39a3ad..0e231c6c9 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -161,6 +161,8 @@ ASSERT_REG_POSITION(vertex_descriptor, 0x200);
 // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
 static_assert(sizeof(Regs) == 0x300 * sizeof(u32), "Invalid total size of register set");
 
+extern Regs registers; // TODO: Not sure if we want to have one global instance for this
+
 
 struct float24 {
     static float24 FromFloat32(float val) {
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 2dbfc68dd..28eb21284 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -20,10 +20,12 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
+    <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index b42823d2a..713458fcf 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -9,6 +9,7 @@
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp">
       <Filter>renderer_opengl</Filter>
     </ClCompile>
+    <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
@@ -16,6 +17,7 @@
     <ClInclude Include="renderer_opengl\renderer_opengl.h">
       <Filter>renderer_opengl</Filter>
     </ClInclude>
+    <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />

From 1a43f694777d356349c1e8a76eded883937efb87 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 26 Jul 2014 15:17:37 +0200
Subject: [PATCH 06/11] Pica: Add register definition for vertex loading and
 rendering.

---
 src/video_core/pica.h | 157 +++++++++++++++++++++++++++++++++---------
 1 file changed, 126 insertions(+), 31 deletions(-)

diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 0e231c6c9..5bd7f416e 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -11,6 +11,8 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"
 
+#include "core/mem_map.h"
+
 namespace Pica {
 
 // Returns index corresponding to the Regs member labeled by field_name
@@ -50,7 +52,7 @@ struct Regs {
 
     INSERT_PADDING_WORDS(0x1bc);
 
-    union {
+    struct {
         enum class Format : u64 {
             BYTE = 0,
             UBYTE = 1,
@@ -58,36 +60,127 @@ struct Regs {
             FLOAT = 3,
         };
 
-        BitField< 0,  2, Format> format0;
-        BitField< 2,  2, u64> size0;      // number of elements minus 1
-        BitField< 4,  2, Format> format1;
-        BitField< 6,  2, u64> size1;
-        BitField< 8,  2, Format> format2;
-        BitField<10,  2, u64> size2;
-        BitField<12,  2, Format> format3;
-        BitField<14,  2, u64> size3;
-        BitField<16,  2, Format> format4;
-        BitField<18,  2, u64> size4;
-        BitField<20,  2, Format> format5;
-        BitField<22,  2, u64> size5;
-        BitField<24,  2, Format> format6;
-        BitField<26,  2, u64> size6;
-        BitField<28,  2, Format> format7;
-        BitField<30,  2, u64> size7;
-        BitField<32,  2, Format> format8;
-        BitField<34,  2, u64> size8;
-        BitField<36,  2, Format> format9;
-        BitField<38,  2, u64> size9;
-        BitField<40,  2, Format> format10;
-        BitField<42,  2, u64> size10;
-        BitField<44,  2, Format> format11;
-        BitField<46,  2, u64> size11;
+        BitField<0, 29, u32> base_address;
 
-        BitField<48, 12, u64> attribute_mask;
-        BitField<60,  4, u64> num_attributes; // number of total attributes minus 1
-    } vertex_descriptor;
+        inline u32 GetBaseAddress() const {
+            // TODO: Ugly, should fix PhysicalToVirtualAddress instead
+            return (base_address * 8) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
+        }
 
-    INSERT_PADDING_WORDS(0xfe);
+        // Descriptor for internal vertex attributes
+        union {
+            BitField< 0,  2, Format> format0; // size of one element
+            BitField< 2,  2, u64> size0;      // number of elements minus 1
+            BitField< 4,  2, Format> format1;
+            BitField< 6,  2, u64> size1;
+            BitField< 8,  2, Format> format2;
+            BitField<10,  2, u64> size2;
+            BitField<12,  2, Format> format3;
+            BitField<14,  2, u64> size3;
+            BitField<16,  2, Format> format4;
+            BitField<18,  2, u64> size4;
+            BitField<20,  2, Format> format5;
+            BitField<22,  2, u64> size5;
+            BitField<24,  2, Format> format6;
+            BitField<26,  2, u64> size6;
+            BitField<28,  2, Format> format7;
+            BitField<30,  2, u64> size7;
+            BitField<32,  2, Format> format8;
+            BitField<34,  2, u64> size8;
+            BitField<36,  2, Format> format9;
+            BitField<38,  2, u64> size9;
+            BitField<40,  2, Format> format10;
+            BitField<42,  2, u64> size10;
+            BitField<44,  2, Format> format11;
+            BitField<46,  2, u64> size11;
+
+            BitField<48, 12, u64> attribute_mask;
+
+            // number of total attributes minus 1
+            BitField<60,  4, u64> num_extra_attributes;
+        };
+
+        inline Format GetFormat(int n) const {
+            Format formats[] = {
+                format0, format1, format2, format3,
+                format4, format5, format6, format7,
+                format8, format9, format10, format11
+            };
+            return formats[n];
+        }
+
+        inline int GetNumElements(int n) const {
+            int sizes[] = {
+                size0, size1, size2, size3,
+                size4, size5, size6, size7,
+                size8, size9, size10, size11
+            };
+            return sizes[n]+1;
+        }
+
+        inline int GetElementSizeInBytes(int n) const {
+            return (GetFormat(n) == Format::FLOAT) ? 4 :
+                (GetFormat(n) == Format::SHORT) ? 2 : 1;
+        }
+
+        inline int GetStride(int n) const {
+            return GetNumElements(n) * GetElementSizeInBytes(n);
+        }
+
+        inline int GetNumTotalAttributes() const {
+            return num_extra_attributes+1;
+        }
+
+        // Attribute loaders map the source vertex data to input attributes
+        // This e.g. allows to load different attributes from different memory locations
+        struct Loader {
+            // Source attribute data offset from the base address
+            u32 data_offset;
+
+            union {
+                BitField< 0, 4, u64> comp0;
+                BitField< 4, 4, u64> comp1;
+                BitField< 8, 4, u64> comp2;
+                BitField<12, 4, u64> comp3;
+                BitField<16, 4, u64> comp4;
+                BitField<20, 4, u64> comp5;
+                BitField<24, 4, u64> comp6;
+                BitField<28, 4, u64> comp7;
+                BitField<32, 4, u64> comp8;
+                BitField<36, 4, u64> comp9;
+                BitField<40, 4, u64> comp10;
+                BitField<44, 4, u64> comp11;
+
+                // bytes for a single vertex in this loader
+                BitField<48, 8, u64> byte_count;
+
+                BitField<60, 4, u64> component_count;
+            };
+
+            inline int GetComponent(int n) const {
+                int components[] = {
+                    comp0, comp1, comp2, comp3,
+                    comp4, comp5, comp6, comp7,
+                    comp8, comp9, comp10, comp11
+                };
+                return components[n];
+            }
+        } attribute_loaders[12];
+    } vertex_attributes;
+
+    struct {
+        enum IndexFormat : u32 {
+            BYTE = 0,
+            SHORT = 1,
+        };
+
+        union {
+            BitField<0, 31, u32> offset; // relative to base attribute address
+            BitField<31, 1, IndexFormat> format;
+        };
+    } index_array;
+
+    INSERT_PADDING_WORDS(0xd8);
 
 #undef INSERT_PADDING_WORDS_HELPER1
 #undef INSERT_PADDING_WORDS_HELPER2
@@ -112,7 +205,8 @@ struct Regs {
 
         ADD_FIELD(viewport_size_x);
         ADD_FIELD(viewport_size_y);
-        ADD_FIELD(vertex_descriptor);
+        ADD_FIELD(vertex_attributes);
+        ADD_FIELD(index_array);
 
         #undef ADD_FIELD
         #endif // _MSC_VER
@@ -153,7 +247,8 @@ private:
 
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
-ASSERT_REG_POSITION(vertex_descriptor, 0x200);
+ASSERT_REG_POSITION(vertex_attributes, 0x200);
+ASSERT_REG_POSITION(index_array, 0x227);
 
 #undef ASSERT_REG_POSITION
 #endif // !defined(_MSC_VER)

From d443f0a92183c94eaa0c33dddbb450eb8fe2fd07 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 26 Jul 2014 16:19:11 +0200
Subject: [PATCH 07/11] Pica: Implement vertex loading.

---
 src/video_core/command_processor.cpp | 81 +++++++++++++++++++++++++++-
 src/video_core/pica.h                | 29 +++++++---
 2 files changed, 102 insertions(+), 8 deletions(-)

diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 515c407ea..e909c8c32 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -4,6 +4,7 @@
 
 #include "pica.h"
 #include "command_processor.h"
+#include "math.h"
 
 
 namespace Pica {
@@ -17,7 +18,85 @@ static inline void WritePicaReg(u32 id, u32 value) {
     registers[id] = value;
 
     switch(id) {
-        // TODO: Perform actions for anything which requires special treatment here...
+        // It seems like these trigger vertex rendering
+        case PICA_REG_INDEX(trigger_draw):
+        case PICA_REG_INDEX(trigger_draw_indexed):
+        {
+            const auto& attribute_config = registers.vertex_attributes;
+            const u8* const base_address = Memory::GetPointer(attribute_config.GetBaseAddress());
+
+            // Information about internal vertex attributes
+            const u8* vertex_attribute_sources[16];
+            u32 vertex_attribute_strides[16];
+            u32 vertex_attribute_formats[16];
+            u32 vertex_attribute_elements[16];
+            u32 vertex_attribute_element_size[16];
+
+            // Setup attribute data from loaders
+            for (int loader = 0; loader < 12; ++loader) {
+                const auto& loader_config = attribute_config.attribute_loaders[loader];
+
+                const u8* load_address = base_address + loader_config.data_offset;
+
+                // TODO: What happens if a loader overwrites a previous one's data?
+                for (int component = 0; component < loader_config.component_count; ++component) {
+                    u32 attribute_index = loader_config.GetComponent(component);
+                    vertex_attribute_sources[attribute_index] = load_address;
+                    vertex_attribute_strides[attribute_index] = loader_config.byte_count;
+                    vertex_attribute_formats[attribute_index] = (u32)attribute_config.GetFormat(attribute_index);
+                    vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
+                    vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
+                    load_address += attribute_config.GetStride(attribute_index);
+                }
+            }
+
+            // Load vertices
+            bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
+
+            const auto& index_info = registers.index_array;
+            const u8* index_address_8 = (u8*)base_address + index_info.offset;
+            const u16* index_address_16 = (u16*)index_address_8;
+            bool index_u16 = (bool)index_info.format;
+
+            for (int index = 0; index < registers.num_vertices; ++index)
+            {
+                int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
+
+                if (is_indexed) {
+                    // TODO: Implement some sort of vertex cache!
+                }
+
+                // Initialize data for the current vertex
+                struct {
+                    Math::Vec4<float24> attr[16];
+                } input;
+
+                for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
+                    for (int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                        const u8* srcdata = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                        const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata :
+                                             (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata :
+                                             (vertex_attribute_formats[i] == 2) ? *(s16*)srcdata :
+                                                                                  *(float*)srcdata;
+                        input.attr[i][comp] = float24::FromFloat32(srcval);
+                        DEBUG_LOG(GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
+                                  comp, i, vertex, index,
+                                  attribute_config.GetBaseAddress(),
+                                  vertex_attribute_sources[i] - base_address,
+                                  srcdata - vertex_attribute_sources[i],
+                                  input.attr[i][comp].ToFloat32());
+                    }
+                }
+                // TODO: Run vertex data through vertex shader
+
+                if (is_indexed) {
+                    // TODO: Add processed vertex to vertex cache!
+                }
+
+                // TODO: Submit vertex to primitive assembly
+            }
+            break;
+        }
 
         default:
             break;
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5bd7f416e..faf124c3d 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -64,7 +64,7 @@ struct Regs {
 
         inline u32 GetBaseAddress() const {
             // TODO: Ugly, should fix PhysicalToVirtualAddress instead
-            return (base_address * 8) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
+            return DecodeAddressRegister(base_address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
         }
 
         // Descriptor for internal vertex attributes
@@ -110,12 +110,12 @@ struct Regs {
         }
 
         inline int GetNumElements(int n) const {
-            int sizes[] = {
+            u64 sizes[] = {
                 size0, size1, size2, size3,
                 size4, size5, size6, size7,
                 size8, size9, size10, size11
             };
-            return sizes[n]+1;
+            return (int)sizes[n]+1;
         }
 
         inline int GetElementSizeInBytes(int n) const {
@@ -128,7 +128,7 @@ struct Regs {
         }
 
         inline int GetNumTotalAttributes() const {
-            return num_extra_attributes+1;
+            return (int)num_extra_attributes+1;
         }
 
         // Attribute loaders map the source vertex data to input attributes
@@ -158,12 +158,12 @@ struct Regs {
             };
 
             inline int GetComponent(int n) const {
-                int components[] = {
+                u64 components[] = {
                     comp0, comp1, comp2, comp3,
                     comp4, comp5, comp6, comp7,
                     comp8, comp9, comp10, comp11
                 };
-                return components[n];
+                return (int)components[n];
             }
         } attribute_loaders[12];
     } vertex_attributes;
@@ -180,7 +180,16 @@ struct Regs {
         };
     } index_array;
 
-    INSERT_PADDING_WORDS(0xd8);
+    // Number of vertices to render
+    u32 num_vertices;
+
+    INSERT_PADDING_WORDS(0x5);
+
+    // These two trigger rendering of triangles
+    u32 trigger_draw;
+    u32 trigger_draw_indexed;
+
+    INSERT_PADDING_WORDS(0xd0);
 
 #undef INSERT_PADDING_WORDS_HELPER1
 #undef INSERT_PADDING_WORDS_HELPER2
@@ -207,6 +216,9 @@ struct Regs {
         ADD_FIELD(viewport_size_y);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
+        ADD_FIELD(num_vertices);
+        ADD_FIELD(trigger_draw);
+        ADD_FIELD(trigger_draw_indexed);
 
         #undef ADD_FIELD
         #endif // _MSC_VER
@@ -249,6 +261,9 @@ ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
+ASSERT_REG_POSITION(num_vertices, 0x228);
+ASSERT_REG_POSITION(trigger_draw, 0x22e);
+ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 
 #undef ASSERT_REG_POSITION
 #endif // !defined(_MSC_VER)

From c52651261916b136f2ea4ff022fb9cead5a73a93 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sat, 26 Jul 2014 19:17:09 +0200
Subject: [PATCH 08/11] Pica: Add vertex shader implementation.

---
 src/video_core/CMakeLists.txt             |   2 +
 src/video_core/command_processor.cpp      | 108 ++++++++-
 src/video_core/pica.h                     | 137 ++++++++++-
 src/video_core/vertex_shader.cpp          | 270 ++++++++++++++++++++++
 src/video_core/vertex_shader.h            | 211 +++++++++++++++++
 src/video_core/video_core.vcxproj         |   2 +
 src/video_core/video_core.vcxproj.filters |   2 +
 7 files changed, 722 insertions(+), 10 deletions(-)
 create mode 100644 src/video_core/vertex_shader.cpp
 create mode 100644 src/video_core/vertex_shader.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 8977c8dca..74304ee49 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(SRCS    command_processor.cpp
             utils.cpp
+            vertex_shader.cpp
             video_core.cpp
             renderer_opengl/renderer_opengl.cpp)
 
@@ -8,6 +9,7 @@ set(HEADERS command_processor.h
             utils.h
             video_core.h
             renderer_base.h
+            vertex_shader.h
             video_core.h
             renderer_opengl/renderer_opengl.h)
 
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index e909c8c32..339fa7726 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -2,9 +2,10 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
-#include "pica.h"
 #include "command_processor.h"
 #include "math.h"
+#include "pica.h"
+#include "vertex_shader.h"
 
 
 namespace Pica {
@@ -13,6 +14,14 @@ Regs registers;
 
 namespace CommandProcessor {
 
+static int float_regs_counter = 0;
+
+static u32 uniform_write_buffer[4];
+
+// Used for VSLoadProgramData and VSLoadSwizzleData
+static u32 vs_binary_write_offset = 0;
+static u32 vs_swizzle_write_offset = 0;
+
 static inline void WritePicaReg(u32 id, u32 value) {
     u32 old_value = registers[id];
     registers[id] = value;
@@ -67,9 +76,7 @@ static inline void WritePicaReg(u32 id, u32 value) {
                 }
 
                 // Initialize data for the current vertex
-                struct {
-                    Math::Vec4<float24> attr[16];
-                } input;
+                VertexShader::InputVertex input;
 
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
                     for (int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
@@ -87,7 +94,7 @@ static inline void WritePicaReg(u32 id, u32 value) {
                                   input.attr[i][comp].ToFloat32());
                     }
                 }
-                // TODO: Run vertex data through vertex shader
+                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
 
                 if (is_indexed) {
                     // TODO: Add processed vertex to vertex cache!
@@ -98,6 +105,97 @@ static inline void WritePicaReg(u32 id, u32 value) {
             break;
         }
 
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[3], 0x2c4):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[4], 0x2c5):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[5], 0x2c6):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8):
+        {
+            auto& uniform_setup = registers.vs_uniform_setup;
+
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            uniform_write_buffer[float_regs_counter++] = value;
+
+            // In float24 mode, uniforms are written in a packed format such that 4 float24 values
+            // are encoded in three 32-bit numbers; in float32 mode, a vector takes four 32-bit
+            // numbers. We write to internal memory once a full vector has been received.
+            if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
+                (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) {
+                float_regs_counter = 0;
+
+                auto& uniform = VertexShader::GetFloatUniform(uniform_setup.index);
+
+                if (uniform_setup.index > 95) {
+                    ERROR_LOG(GPU, "Invalid VS uniform index %d", (int)uniform_setup.index);
+                    break;
+                }
+
+                // NOTE: The destination component order indeed is "backwards"
+                if (uniform_setup.IsFloat32()) {
+                    for (auto i : {0,1,2,3})
+                        uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i]));
+                } else {
+                    // TODO: Untested
+                    uniform.w = float24::FromRawFloat24(uniform_write_buffer[0] >> 8);
+                    uniform.z = float24::FromRawFloat24(((uniform_write_buffer[0] & 0xFF)<<16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
+                    uniform.y = float24::FromRawFloat24(((uniform_write_buffer[1] & 0xFFFF)<<8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
+                    uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF);
+                }
+
+                DEBUG_LOG(GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
+                          uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(),
+                          uniform.w.ToFloat32());
+
+                // TODO: Verify that this actually modifies the register!
+                uniform_setup.index = uniform_setup.index + 1;
+            }
+            break;
+        }
+
+        // Seems to be used to reset the write pointer for VSLoadProgramData
+        case PICA_REG_INDEX(vs_program.begin_load):
+            vs_binary_write_offset = 0;
+            break;
+
+        // Load shader program code
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[2], 0x2ce):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[3], 0x2cf):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[4], 0x2d0):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[5], 0x2d1):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
+        {
+            VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
+            vs_binary_write_offset++;
+            break;
+        }
+
+        // Seems to be used to reset the write pointer for VSLoadSwizzleData
+        case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
+            vs_swizzle_write_offset = 0;
+            break;
+
+        // Load swizzle pattern data
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[2], 0x2d8):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[3], 0x2d9):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[4], 0x2da):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[5], 0x2db):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
+        {
+            VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
+            vs_swizzle_write_offset++;
+            break;
+        }
+
         default:
             break;
     }
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index faf124c3d..42303a585 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -50,7 +50,39 @@ struct Regs {
     INSERT_PADDING_WORDS(0x1);
     BitField<0, 24, u32> viewport_size_y;
 
-    INSERT_PADDING_WORDS(0x1bc);
+    INSERT_PADDING_WORDS(0xc);
+
+    union {
+        // Maps components of output vertex attributes to semantics
+        enum Semantic : u32
+        {
+            POSITION_X   =  0,
+            POSITION_Y   =  1,
+            POSITION_Z   =  2,
+            POSITION_W   =  3,
+
+            COLOR_R      =  8,
+            COLOR_G      =  9,
+            COLOR_B      = 10,
+            COLOR_A      = 11,
+
+            TEXCOORD0_U  = 12,
+            TEXCOORD0_V  = 13,
+            TEXCOORD1_U  = 14,
+            TEXCOORD1_V  = 15,
+            TEXCOORD2_U  = 22,
+            TEXCOORD2_V  = 23,
+
+            INVALID      = 31,
+        };
+
+        BitField< 0, 5, Semantic> map_x;
+        BitField< 8, 5, Semantic> map_y;
+        BitField<16, 5, Semantic> map_z;
+        BitField<24, 5, Semantic> map_w;
+    } vs_output_attributes[7];
+
+    INSERT_PADDING_WORDS(0x1a9);
 
     struct {
         enum class Format : u64 {
@@ -133,7 +165,7 @@ struct Regs {
 
         // Attribute loaders map the source vertex data to input attributes
         // This e.g. allows to load different attributes from different memory locations
-        struct Loader {
+        struct {
             // Source attribute data offset from the base address
             u32 data_offset;
 
@@ -189,7 +221,90 @@ struct Regs {
     u32 trigger_draw;
     u32 trigger_draw_indexed;
 
-    INSERT_PADDING_WORDS(0xd0);
+    INSERT_PADDING_WORDS(0x8a);
+
+    // Offset to shader program entry point (in words)
+    BitField<0, 16, u32> vs_main_offset;
+
+    union {
+        BitField< 0, 4, u64> attribute0_register;
+        BitField< 4, 4, u64> attribute1_register;
+        BitField< 8, 4, u64> attribute2_register;
+        BitField<12, 4, u64> attribute3_register;
+        BitField<16, 4, u64> attribute4_register;
+        BitField<20, 4, u64> attribute5_register;
+        BitField<24, 4, u64> attribute6_register;
+        BitField<28, 4, u64> attribute7_register;
+        BitField<32, 4, u64> attribute8_register;
+        BitField<36, 4, u64> attribute9_register;
+        BitField<40, 4, u64> attribute10_register;
+        BitField<44, 4, u64> attribute11_register;
+        BitField<48, 4, u64> attribute12_register;
+        BitField<52, 4, u64> attribute13_register;
+        BitField<56, 4, u64> attribute14_register;
+        BitField<60, 4, u64> attribute15_register;
+
+        int GetRegisterForAttribute(int attribute_index) {
+            u64 fields[] = {
+                attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
+                attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
+                attribute8_register,  attribute9_register,  attribute10_register, attribute11_register,
+                attribute12_register, attribute13_register, attribute14_register, attribute15_register,
+            };
+            return (int)fields[attribute_index];
+        }
+    } vs_input_register_map;
+
+    INSERT_PADDING_WORDS(0x3);
+
+    struct {
+        enum Format : u32
+        {
+            FLOAT24 = 0,
+            FLOAT32 = 1
+        };
+
+        bool IsFloat32() const {
+            return format == FLOAT32;
+        }
+
+        union {
+            // Index of the next uniform to write to
+            // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices
+            BitField<0, 7, u32> index;
+
+            BitField<31, 1, Format> format;
+        };
+
+        // Writing to these registers sets the "current" uniform.
+        // TODO: It's not clear how the hardware stores what the "current" uniform is.
+        u32 set_value[8];
+
+    } vs_uniform_setup;
+
+    INSERT_PADDING_WORDS(0x2);
+
+    struct {
+        u32 begin_load;
+
+        // Writing to these registers sets the "current" word in the shader program.
+        // TODO: It's not clear how the hardware stores what the "current" word is.
+        u32 set_word[8];
+    } vs_program;
+
+    INSERT_PADDING_WORDS(0x1);
+
+    // This register group is used to load an internal table of swizzling patterns,
+    // which are indexed by each shader instruction to specify vector component swizzling.
+    struct {
+        u32 begin_load;
+
+        // Writing to these registers sets the "current" swizzle pattern in the table.
+        // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
+        u32 set_word[8];
+    } vs_swizzle_patterns;
+
+    INSERT_PADDING_WORDS(0x22);
 
 #undef INSERT_PADDING_WORDS_HELPER1
 #undef INSERT_PADDING_WORDS_HELPER2
@@ -219,6 +334,11 @@ struct Regs {
         ADD_FIELD(num_vertices);
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(vs_main_offset);
+        ADD_FIELD(vs_input_register_map);
+        ADD_FIELD(vs_uniform_setup);
+        ADD_FIELD(vs_program);
+        ADD_FIELD(vs_swizzle_patterns);
 
         #undef ADD_FIELD
         #endif // _MSC_VER
@@ -259,17 +379,25 @@ private:
 
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
+ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
+ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
+ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
+ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
+ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
+ASSERT_REG_POSITION(vs_program, 0x2cb);
+ASSERT_REG_POSITION(vs_swizzle_patterns, 0x2d5);
 
 #undef ASSERT_REG_POSITION
 #endif // !defined(_MSC_VER)
 
 // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
-static_assert(sizeof(Regs) == 0x300 * sizeof(u32), "Invalid total size of register set");
+static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be");
+static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be");
 
 extern Regs registers; // TODO: Not sure if we want to have one global instance for this
 
@@ -347,7 +475,6 @@ private:
     float value;
 };
 
-
 union CommandHeader {
     CommandHeader(u32 h) : hex(h) {}
 
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
new file mode 100644
index 000000000..93830a96a
--- /dev/null
+++ b/src/video_core/vertex_shader.cpp
@@ -0,0 +1,270 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "pica.h"
+#include "vertex_shader.h"
+#include <core/mem_map.h>
+#include <common/file_util.h>
+
+namespace Pica {
+
+namespace VertexShader {
+
+static struct {
+    Math::Vec4<float24> f[96];
+} shader_uniforms;
+
+
+// TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
+// For now, we just keep these local arrays around.
+static u32 shader_memory[1024];
+static u32 swizzle_data[1024];
+
+void SubmitShaderMemoryChange(u32 addr, u32 value)
+{
+    if (addr < sizeof(shader_memory) / sizeof(shader_memory[0])) shader_memory[addr] = value; // addr is a word index; out-of-range writes are dropped instead of corrupting adjacent memory
+}
+
+void SubmitSwizzleDataChange(u32 addr, u32 value)
+{
+    if (addr < sizeof(swizzle_data) / sizeof(swizzle_data[0])) swizzle_data[addr] = value; // addr is a word index; out-of-range writes are dropped instead of corrupting adjacent memory
+}
+
+Math::Vec4<float24>& GetFloatUniform(u32 index)
+{
+    return shader_uniforms.f[index];
+}
+
+struct VertexShaderState {
+    u32* program_counter;
+
+    const float24* input_register_table[16];
+    float24* output_register_table[7*4];
+
+    Math::Vec4<float24> temporary_registers[16];
+    bool status_registers[2];
+
+    enum {
+        INVALID_ADDRESS = 0xFFFFFFFF
+    };
+    u32 call_stack[8]; // TODO: What is the maximal call stack depth?
+    u32* call_stack_pointer;
+};
+
+static void ProcessShaderCode(VertexShaderState& state) {
+    while (true) {
+        bool increment_pc = true;
+        bool exit_loop = false;
+        const Instruction& instr = *(const Instruction*)state.program_counter;
+
+        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1]
+                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x
+                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x
+                             : nullptr;
+        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2]
+                             : &state.temporary_registers[instr.common.src2-0x10].x;
+        // TODO: Unsure about the limit values
+        float24* dest = (instr.common.dest <= 0x1C) ? state.output_register_table[instr.common.dest]
+                             : (instr.common.dest <= 0x3C) ? nullptr
+                             : (instr.common.dest <= 0x7C) ? &state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4]
+                             : nullptr;
+
+        const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+
+        const float24 src1[4] = {
+            src1_[(int)swizzle.GetSelectorSrc1(0)],
+            src1_[(int)swizzle.GetSelectorSrc1(1)],
+            src1_[(int)swizzle.GetSelectorSrc1(2)],
+            src1_[(int)swizzle.GetSelectorSrc1(3)],
+        };
+        const float24 src2[4] = {
+            src2_[(int)swizzle.GetSelectorSrc2(0)],
+            src2_[(int)swizzle.GetSelectorSrc2(1)],
+            src2_[(int)swizzle.GetSelectorSrc2(2)],
+            src2_[(int)swizzle.GetSelectorSrc2(3)],
+        };
+
+        switch (instr.opcode) {
+            case Instruction::OpCode::ADD:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] + src2[i];
+                }
+
+                break;
+            }
+
+            case Instruction::OpCode::MUL:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] * src2[i];
+                }
+
+                break;
+            }
+
+            case Instruction::OpCode::DP3:
+            case Instruction::OpCode::DP4:
+            {
+                float24 dot = float24::FromFloat32(0.f);
+                int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4;
+                for (int i = 0; i < num_components; ++i)
+                    dot = dot + src1[i] * src2[i];
+
+                for (int i = 0; i < 4; ++i) { // the scalar result goes to every enabled dest component, including w for DP3
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = dot;
+                }
+                break;
+            }
+
+            // Reciprocal
+            case Instruction::OpCode::RCP:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32());
+                }
+
+                break;
+            }
+
+            // Reciprocal Square Root
+            case Instruction::OpCode::RSQ:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32()));
+                }
+
+                break;
+            }
+
+            case Instruction::OpCode::MOV:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i];
+                }
+                break;
+            }
+
+            case Instruction::OpCode::RET:
+                if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) {
+                    exit_loop = true;
+                } else {
+                    state.program_counter = &shader_memory[*state.call_stack_pointer]; // resume at the CALL; the increment below steps past it
+                    *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS; // clear the popped slot, not the enclosing frame's return address
+                }
+
+                break;
+
+            case Instruction::OpCode::CALL:
+                increment_pc = false;
+                // The push below pre-increments, so the *next* slot must still lie inside call_stack (bound is in elements, not bytes)
+                _dbg_assert_(GPU, state.call_stack_pointer + 1 < state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]));
+
+                *++state.call_stack_pointer = state.program_counter - shader_memory;
+                // TODO: Does this offset refer to the beginning of shader memory?
+                state.program_counter = &shader_memory[instr.flow_control.offset_words];
+                break;
+
+            case Instruction::OpCode::FLS:
+                // TODO: Do whatever needs to be done here?
+                break;
+
+            default:
+                ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex);
+                break;
+        }
+
+        if (increment_pc)
+            ++state.program_counter;
+
+        if (exit_loop)
+            break;
+    }
+}
+
+OutputVertex RunShader(const InputVertex& input, int num_attributes)
+{
+    VertexShaderState state;
+
+    const u32* main = &shader_memory[registers.vs_main_offset];
+    state.program_counter = (u32*)main;
+
+    // Setup input register table
+    const auto& attribute_register_map = registers.vs_input_register_map;
+    const float24 dummy_register[4] = { float24::FromFloat32(0.f), float24::FromFloat32(0.f), float24::FromFloat32(0.f), float24::FromFloat32(0.f) }; // four components: the swizzle reads x/y/z/w through every input register pointer
+    std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register[0]);
+    if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
+    if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
+    if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
+    if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
+    if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
+    if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
+    if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
+    if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
+    if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
+    if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
+    if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
+    if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
+    if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
+    if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
+    if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
+    if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
+
+    // Setup output register table
+    OutputVertex ret;
+    for (int i = 0; i < 7; ++i) {
+        const auto& output_register_map = registers.vs_output_attributes[i];
+
+        u32 semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y,
+            output_register_map.map_z, output_register_map.map_w
+        };
+
+        for (int comp = 0; comp < 4; ++comp)
+            state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
+    }
+
+    state.status_registers[0] = false;
+    state.status_registers[1] = false;
+    std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]),
+              VertexShaderState::INVALID_ADDRESS);
+    state.call_stack_pointer = &state.call_stack[0];
+
+    ProcessShaderCode(state);
+
+    DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+        ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+        ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
+        ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
+
+    return ret;
+}
+
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
new file mode 100644
index 000000000..1b71e367b
--- /dev/null
+++ b/src/video_core/vertex_shader.h
@@ -0,0 +1,211 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <initializer_list>
+
+#include <common/common_types.h>
+
+#include "math.h"
+#include "pica.h"
+
+namespace Pica {
+
+namespace VertexShader {
+
+struct InputVertex {
+    Math::Vec4<float24> attr[16];
+};
+
+struct OutputVertex {
+    OutputVertex() = default;
+
+    // VS output attributes
+    Math::Vec4<float24> pos;
+    Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
+    Math::Vec4<float24> color;
+    Math::Vec2<float24> tc0;
+    float24 tc0_v;
+
+    // Padding for optimal alignment
+    float24 pad[14];
+
+    // Attributes used to store intermediate results
+
+    // position after perspective divide
+    Math::Vec3<float24> screenpos;
+
+    // Linear interpolation in place: each of pos, tc0, screenpos and color becomes this * factor + vtx * (1 - factor)
+    // factor: 1=this, 0=vtx
+    void Lerp(float24 factor, const OutputVertex& vtx) {
+        pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
+
+        // TODO: Should perform perspective correct interpolation here...
+        tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+
+        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
+
+        color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
+    }
+
+    // Linear interpolation: returns a copy of v0 blended towards v1 (v0 * factor + v1 * (1 - factor))
+    // factor: 1=v0, 0=v1
+    static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
+        OutputVertex ret = v0;
+        ret.Lerp(factor, v1);
+        return ret;
+    }
+};
+static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
+
+union Instruction {
+    enum class OpCode : u32 {
+        ADD = 0x0,
+        DP3 = 0x1,
+        DP4 = 0x2,
+
+        MUL = 0x8,
+
+        MAX = 0xC,
+        MIN = 0xD,
+        RCP = 0xE,
+        RSQ = 0xF,
+
+        MOV = 0x13,
+
+        RET = 0x21,
+        FLS = 0x22, // Flush
+        CALL = 0x24,
+    };
+
+    std::string GetOpCodeName() const {
+        std::map<OpCode, std::string> map = {
+            { OpCode::ADD, "ADD" },
+            { OpCode::DP3, "DP3" },
+            { OpCode::DP4, "DP4" },
+            { OpCode::MUL, "MUL" },
+            { OpCode::MAX, "MAX" },
+            { OpCode::MIN, "MIN" },
+            { OpCode::RCP, "RCP" },
+            { OpCode::RSQ, "RSQ" },
+            { OpCode::MOV, "MOV" },
+            { OpCode::RET, "RET" },
+            { OpCode::FLS, "FLS" },
+        };
+        auto it = map.find(opcode);
+        if (it == map.end())
+            return "UNK";
+        else
+            return it->second;
+    }
+
+    u32 hex;
+
+    BitField<0x1a, 0x6, OpCode> opcode;
+
+    // General notes:
+    //
+    // When two input registers are used, one of them uses a 5-bit index while the other
+    // one uses a 7-bit index. This is because at most one floating point uniform may be used
+    // as an input.
+
+
+    // Format used e.g. by arithmetic instructions and comparisons
+    // "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats),
+    // while "dest" addresses individual floats.
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+        BitField<0x07, 0x5, u32> src2;
+        BitField<0x0c, 0x7, u32> src1;
+        BitField<0x13, 0x7, u32> dest;
+    } common;
+
+    // Format used for flow control instructions ("if")
+    union {
+        BitField<0x00, 0x8, u32> num_instructions;
+        BitField<0x0a, 0xc, u32> offset_words;
+    } flow_control;
+};
+
+union SwizzlePattern {
+    u32 hex;
+
+    enum class Selector : u32 {
+        x = 0,
+        y = 1,
+        z = 2,
+        w = 3
+    };
+
+    Selector GetSelectorSrc1(int comp) const {
+        Selector selectors[] = {
+            src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3
+        };
+        return selectors[comp];
+    }
+
+    Selector GetSelectorSrc2(int comp) const {
+        Selector selectors[] = {
+            src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3
+        };
+        return selectors[comp];
+    }
+
+    bool DestComponentEnabled(int i) const {
+        return (dest_mask & (0x8 >> i));
+    }
+
+    std::string SelectorToString(bool src2) const {
+        std::map<Selector, std::string> map = {
+            { Selector::x, "x" },
+            { Selector::y, "y" },
+            { Selector::z, "z" },
+            { Selector::w, "w" }
+        };
+        std::string ret;
+        for (int i = 0; i < 4; ++i) {
+            ret += map.at(src2 ? GetSelectorSrc2(i) : GetSelectorSrc1(i));
+        }
+        return ret;
+    }
+
+    std::string DestMaskToString() const {
+        std::string ret;
+        for (int i = 0; i < 4; ++i) {
+            if (!DestComponentEnabled(i))
+                ret += "_";
+            else
+                ret += "xyzw"[i];
+        }
+        return ret;
+    }
+
+    // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x
+    BitField< 0, 4, u32> dest_mask;
+
+    BitField< 5, 2, Selector> src1_selector_3;
+    BitField< 7, 2, Selector> src1_selector_2;
+    BitField< 9, 2, Selector> src1_selector_1;
+    BitField<11, 2, Selector> src1_selector_0;
+
+    BitField<14, 2, Selector> src2_selector_3;
+    BitField<16, 2, Selector> src2_selector_2;
+    BitField<18, 2, Selector> src2_selector_1;
+    BitField<20, 2, Selector> src2_selector_0;
+
+    BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign?
+};
+
+void SubmitShaderMemoryChange(u32 addr, u32 value);
+void SubmitSwizzleDataChange(u32 addr, u32 value);
+
+OutputVertex RunShader(const InputVertex& input, int num_attributes);
+
+Math::Vec4<float24>& GetFloatUniform(u32 index);
+
+} // namespace
+
+} // namespace
+
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 28eb21284..56729dc03 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -22,6 +22,7 @@
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="utils.cpp" />
+    <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
@@ -31,6 +32,7 @@
     <ClInclude Include="pica.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
+    <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
     <ClInclude Include="renderer_opengl\renderer_opengl.h" />
   </ItemGroup>
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index 713458fcf..db0b37018 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -11,6 +11,7 @@
     </ClCompile>
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="utils.cpp" />
+    <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
@@ -23,6 +24,7 @@
     <ClInclude Include="pica.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
+    <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
   </ItemGroup>
   <ItemGroup>

From 9a76a2d0611fc0c35b665fb886d437e8f4d5b4df Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 27 Jul 2014 14:58:30 +0200
Subject: [PATCH 09/11] Pica: Add primitive assembly stage.

---
 src/video_core/CMakeLists.txt             |  2 +
 src/video_core/command_processor.cpp      |  3 +-
 src/video_core/pica.h                     | 15 ++++++-
 src/video_core/primitive_assembly.cpp     | 52 +++++++++++++++++++++++
 src/video_core/primitive_assembly.h       | 21 +++++++++
 src/video_core/video_core.vcxproj         |  2 +
 src/video_core/video_core.vcxproj.filters |  2 +
 7 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 src/video_core/primitive_assembly.cpp
 create mode 100644 src/video_core/primitive_assembly.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 74304ee49..b06f14db0 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(SRCS    command_processor.cpp
+            primitive_assembly.cpp
             utils.cpp
             vertex_shader.cpp
             video_core.cpp
@@ -6,6 +7,7 @@ set(SRCS    command_processor.cpp
 
 set(HEADERS command_processor.h
             math.h
+            primitive_assembly.h
             utils.h
             video_core.h
             renderer_base.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 339fa7726..020a4da3f 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -5,6 +5,7 @@
 #include "command_processor.h"
 #include "math.h"
 #include "pica.h"
+#include "primitive_assembly.h"
 #include "vertex_shader.h"
 
 
@@ -100,7 +101,7 @@ static inline void WritePicaReg(u32 id, u32 value) {
                     // TODO: Add processed vertex to vertex cache!
                 }
 
-                // TODO: Submit vertex to primitive assembly
+                PrimitiveAssembly::SubmitVertex(output);
             }
             break;
         }
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 42303a585..6bbd3ce33 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -221,7 +221,18 @@ struct Regs {
     u32 trigger_draw;
     u32 trigger_draw_indexed;
 
-    INSERT_PADDING_WORDS(0x8a);
+    INSERT_PADDING_WORDS(0x2e);
+
+    enum class TriangleTopology : u32 {
+        List        = 0,
+        Strip       = 1,
+        Fan         = 2,
+        ListIndexed = 3, // TODO: No idea if this is correct
+    };
+
+    BitField<8, 2, TriangleTopology> triangle_topology;
+
+    INSERT_PADDING_WORDS(0x5b);
 
     // Offset to shader program entry point (in words)
     BitField<0, 16, u32> vs_main_offset;
@@ -334,6 +345,7 @@ struct Regs {
         ADD_FIELD(num_vertices);
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_main_offset);
         ADD_FIELD(vs_input_register_map);
         ADD_FIELD(vs_uniform_setup);
@@ -386,6 +398,7 @@ ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
+ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
 ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
 ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
new file mode 100644
index 000000000..b2196d13c
--- /dev/null
+++ b/src/video_core/primitive_assembly.cpp
@@ -0,0 +1,52 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "pica.h"
+#include "primitive_assembly.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace PrimitiveAssembly {
+
+static OutputVertex buffer[2];
+static int buffer_index = 0; // TODO: reset this on emulation restart
+
+void SubmitVertex(OutputVertex& vtx)
+{
+    switch (registers.triangle_topology) {
+        case Regs::TriangleTopology::List:
+        case Regs::TriangleTopology::ListIndexed:
+            if (buffer_index < 2) {
+                buffer[buffer_index++] = vtx;
+            } else {
+                buffer_index = 0;
+
+                // TODO
+                // Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+            }
+            break;
+
+        case Regs::TriangleTopology::Fan:
+            if (buffer_index == 2) {
+                buffer_index = 0;
+
+                // TODO
+                // Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+
+                buffer[1] = vtx;
+            } else {
+                buffer[buffer_index++] = vtx;
+            }
+            break;
+
+        default:
+            ERROR_LOG(GPU, "Unknown triangle mode %x:", (int)registers.triangle_topology.Value());
+            break;
+    }
+}
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
new file mode 100644
index 000000000..2a2b0c170
--- /dev/null
+++ b/src/video_core/primitive_assembly.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex;
+}
+
+namespace PrimitiveAssembly {
+
+using VertexShader::OutputVertex;
+
+void SubmitVertex(OutputVertex& vtx);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 56729dc03..9cf3b0858 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -21,6 +21,7 @@
   <ItemGroup>
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
     <ClCompile Include="command_processor.cpp" />
+    <ClCompile Include="primitive_assembly.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
@@ -30,6 +31,7 @@
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
+    <ClInclude Include="primitive_assembly.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index db0b37018..9da20b284 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -10,6 +10,7 @@
       <Filter>renderer_opengl</Filter>
     </ClCompile>
     <ClCompile Include="command_processor.cpp" />
+    <ClCompile Include="primitive_assembly.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
@@ -22,6 +23,7 @@
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
+    <ClInclude Include="primitive_assembly.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />

From 94aa9da562457e1fed4911d1cda770c3e42bd419 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 27 Jul 2014 17:34:11 +0200
Subject: [PATCH 10/11] Pica: Add triangle clipper.

---
 src/video_core/CMakeLists.txt             |   6 +-
 src/video_core/clipper.cpp                | 178 ++++++++++++++++++++++
 src/video_core/clipper.h                  |  21 +++
 src/video_core/pica.h                     |  22 ++-
 src/video_core/primitive_assembly.cpp     |   7 +-
 src/video_core/video_core.vcxproj         |   2 +
 src/video_core/video_core.vcxproj.filters |   2 +
 7 files changed, 230 insertions(+), 8 deletions(-)
 create mode 100644 src/video_core/clipper.cpp
 create mode 100644 src/video_core/clipper.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index b06f14db0..828bf30fc 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,11 +1,13 @@
-set(SRCS    command_processor.cpp
+set(SRCS    clipper.cpp
+            command_processor.cpp
             primitive_assembly.cpp
             utils.cpp
             vertex_shader.cpp
             video_core.cpp
             renderer_opengl/renderer_opengl.cpp)
 
-set(HEADERS command_processor.h
+set(HEADERS clipper.h
+            command_processor.h
             math.h
             primitive_assembly.h
             utils.h
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
new file mode 100644
index 000000000..e9ab6242c
--- /dev/null
+++ b/src/video_core/clipper.cpp
@@ -0,0 +1,178 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <vector>
+
+#include "clipper.h"
+#include "pica.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace Clipper {
+
+struct ClippingEdge {
+public:
+    enum Type {
+        POS_X = 0,
+        NEG_X = 1,
+        POS_Y = 2,
+        NEG_Y = 3,
+        POS_Z = 4,
+        NEG_Z = 5,
+    };
+
+    ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
+
+    bool IsInside(const OutputVertex& vertex) const {
+        switch (type) {
+        case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
+        case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
+        case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
+        case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
+
+        // TODO: Check z compares ... should be 0..1 instead?
+        case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
+
+        default:
+        case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
+        }
+    }
+
+    bool IsOutSide(const OutputVertex& vertex) const {
+        return !IsInside(vertex);
+    }
+
+    OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
+        auto dotpr = [this](const OutputVertex& vtx) {
+            switch (type) {
+            case POS_X: return vtx.pos.x - vtx.pos.w;
+            case NEG_X: return -vtx.pos.x - vtx.pos.w;
+            case POS_Y: return vtx.pos.y - vtx.pos.w;
+            case NEG_Y: return -vtx.pos.y - vtx.pos.w;
+
+            // TODO: Verify z clipping
+            case POS_Z: return vtx.pos.z - vtx.pos.w;
+
+            default:
+            case NEG_Z: return -vtx.pos.w;
+            }
+        };
+
+        float24 dp = dotpr(v0);
+        float24 dp_prev = dotpr(v1);
+        float24 factor = dp_prev / (dp_prev - dp);
+
+        return OutputVertex::Lerp(factor, v0, v1);
+    }
+
+private:
+    Type type;
+    float24 pos;
+};
+
+static void InitScreenCoordinates(OutputVertex& vtx)
+{
+    struct {
+        float24 halfsize_x;
+        float24 offset_x;
+        float24 halfsize_y;
+        float24 offset_y;
+        float24 zscale;
+        float24 offset_z;
+    } viewport;
+
+    viewport.halfsize_x = float24::FromRawFloat24(registers.viewport_size_x);
+    viewport.halfsize_y = float24::FromRawFloat24(registers.viewport_size_y);
+    viewport.offset_x   = float24::FromFloat32(registers.viewport_corner.x);
+    viewport.offset_y   = float24::FromFloat32(registers.viewport_corner.y);
+    viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
+    viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
+
+    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
+    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x / float24::FromFloat32(2.0) + viewport.offset_x;
+    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+}
+
+void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+
+    // TODO (neobrain):
+    // The list of output vertices has some fixed maximum size,
+    // however I haven't taken the time to figure out what it is exactly.
+    // For now, we hence just assume a maximal size of 1000 vertices.
+    const size_t max_vertices = 1000;
+    std::vector<OutputVertex> buffer_vertices;
+    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
+
+    // Make sure to reserve space for all vertices.
+    // Without this, buffer reallocation would invalidate references.
+    buffer_vertices.reserve(max_vertices);
+
+    // Simple implementation of the Sutherland-Hodgman clipping algorithm.
+    // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
+    for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
+                       ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
+                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
+
+        const std::vector<OutputVertex*> input_list = output_list;
+        output_list.clear();
+
+        const OutputVertex* reference_vertex = input_list.back();
+
+        for (const auto& vertex : input_list) {
+            // NOTE: This algorithm changes vertex order in some cases!
+            if (edge.IsInside(*vertex)) {
+                if (edge.IsOutSide(*reference_vertex)) {
+                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                    output_list.push_back(&(buffer_vertices.back()));
+                }
+
+                output_list.push_back(vertex);
+            } else if (edge.IsInside(*reference_vertex)) {
+                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                output_list.push_back(&(buffer_vertices.back()));
+            }
+
+            reference_vertex = vertex;
+        }
+
+        // Need to have at least a full triangle to continue...
+        if (output_list.size() < 3)
+            return;
+    }
+
+    InitScreenCoordinates(*(output_list[0]));
+    InitScreenCoordinates(*(output_list[1]));
+
+    for (int i = 0; i < output_list.size() - 2; i ++) {
+        OutputVertex& vtx0 = *(output_list[0]);
+        OutputVertex& vtx1 = *(output_list[i+1]);
+        OutputVertex& vtx2 = *(output_list[i+2]);
+
+        InitScreenCoordinates(vtx2);
+        // NOTE: the variadic arguments must match the format string one-to-one (3 ints, then 12 + 9 floats)
+        DEBUG_LOG(GPU,
+                  "Triangle %d/%d (%d buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
+                  "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
+                  i, (int)output_list.size(), (int)buffer_vertices.size(),
+                  vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
+                  vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
+                  vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
+                  vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), vtx0.screenpos.z.ToFloat32(),
+                  vtx1.screenpos.x.ToFloat32(), vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(),
+                  vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), vtx2.screenpos.z.ToFloat32());
+
+        // TODO: Send triangle to rasterizer
+    }
+}
+
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h
new file mode 100644
index 000000000..14d31ca1e
--- /dev/null
+++ b/src/video_core/clipper.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex;
+}
+
+namespace Clipper {
+
+using VertexShader::OutputVertex;
+
+void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 6bbd3ce33..1ced0d323 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -50,7 +50,12 @@ struct Regs {
     INSERT_PADDING_WORDS(0x1);
     BitField<0, 24, u32> viewport_size_y;
 
-    INSERT_PADDING_WORDS(0xc);
+    INSERT_PADDING_WORDS(0x9);
+
+    BitField<0, 24, u32> viewport_depth_range; // float24
+    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+
+    INSERT_PADDING_WORDS(0x1);
 
     union {
         // Maps components of output vertex attributes to semantics
@@ -82,7 +87,14 @@ struct Regs {
         BitField<24, 5, Semantic> map_w;
     } vs_output_attributes[7];
 
-    INSERT_PADDING_WORDS(0x1a9);
+    INSERT_PADDING_WORDS(0x11);
+
+    union {
+        BitField< 0, 16, u32> x;
+        BitField<16, 16, u32> y;
+    } viewport_corner;
+
+    INSERT_PADDING_WORDS(0x197);
 
     struct {
         enum class Format : u64 {
@@ -340,6 +352,9 @@ struct Regs {
 
         ADD_FIELD(viewport_size_x);
         ADD_FIELD(viewport_size_y);
+        ADD_FIELD(viewport_depth_range);
+        ADD_FIELD(viewport_depth_far_plane);
+        ADD_FIELD(viewport_corner);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
         ADD_FIELD(num_vertices);
@@ -391,8 +406,11 @@ private:
 
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
+ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
+ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
+ASSERT_REG_POSITION(viewport_corner, 0x68);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index b2196d13c..2354ffb99 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
+#include "clipper.h"
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
@@ -23,8 +24,7 @@ void SubmitVertex(OutputVertex& vtx)
             } else {
                 buffer_index = 0;
 
-                // TODO
-                // Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
             }
             break;
 
@@ -32,8 +32,7 @@ void SubmitVertex(OutputVertex& vtx)
             if (buffer_index == 2) {
                 buffer_index = 0;
 
-                // TODO
-                // Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
 
                 buffer[1] = vtx;
             } else {
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 9cf3b0858..99ab63dce 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -20,6 +20,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
+    <ClCompile Include="clipper.cpp" />
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="primitive_assembly.cpp" />
     <ClCompile Include="utils.cpp" />
@@ -27,6 +28,7 @@
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="clipper.h" />
     <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index 9da20b284..5222f2fa0 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -9,6 +9,7 @@
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp">
       <Filter>renderer_opengl</Filter>
     </ClCompile>
+    <ClCompile Include="clipper.cpp" />
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="primitive_assembly.cpp" />
     <ClCompile Include="utils.cpp" />
@@ -19,6 +20,7 @@
     <ClInclude Include="renderer_opengl\renderer_opengl.h">
       <Filter>renderer_opengl</Filter>
     </ClInclude>
+    <ClInclude Include="clipper.h" />
     <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
     <ClInclude Include="math.h" />

From 94d742fe172ba933af321bfb0e02889b40d0c179 Mon Sep 17 00:00:00 2001
From: Tony Wasserka <NeoBrainX@gmail.com>
Date: Sun, 27 Jul 2014 18:02:35 +0200
Subject: [PATCH 11/11] Pica: Add basic rasterizer.

---
 src/video_core/CMakeLists.txt             |   2 +
 src/video_core/clipper.cpp                |   3 +-
 src/video_core/pica.h                     |  52 ++++++-
 src/video_core/rasterizer.cpp             | 180 ++++++++++++++++++++++
 src/video_core/rasterizer.h               |  21 +++
 src/video_core/video_core.vcxproj         |   2 +
 src/video_core/video_core.vcxproj.filters |   2 +
 7 files changed, 260 insertions(+), 2 deletions(-)
 create mode 100644 src/video_core/rasterizer.cpp
 create mode 100644 src/video_core/rasterizer.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 828bf30fc..8e7b93acb 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(SRCS    clipper.cpp
             command_processor.cpp
             primitive_assembly.cpp
+            rasterizer.cpp
             utils.cpp
             vertex_shader.cpp
             video_core.cpp
@@ -10,6 +11,7 @@ set(HEADERS clipper.h
             command_processor.h
             math.h
             primitive_assembly.h
+            rasterizer.h
             utils.h
             video_core.h
             renderer_base.h
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index e9ab6242c..b7180328c 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -6,6 +6,7 @@
 
 #include "clipper.h"
 #include "pica.h"
+#include "rasterizer.h"
 #include "vertex_shader.h"
 
 namespace Pica {
@@ -168,7 +169,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                   vtx1.screenpos.x.ToFloat32(), vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(),
                   vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), vtx2.screenpos.z.ToFloat32());
 
-        // TODO: Send triangle to rasterizer
+        Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2);
     }
 }
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 1ced0d323..81af57336 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -94,7 +94,55 @@ struct Regs {
         BitField<16, 16, u32> y;
     } viewport_corner;
 
-    INSERT_PADDING_WORDS(0x197);
+    INSERT_PADDING_WORDS(0xa7);
+
+    struct {
+        enum ColorFormat : u32 {
+            RGBA8    = 0,
+            RGB8     = 1,
+            RGBA5551 = 2,
+            RGB565   = 3,
+            RGBA4    = 4,
+        };
+
+        INSERT_PADDING_WORDS(0x6);
+
+        u32 depth_format;
+        u32 color_format;
+
+        INSERT_PADDING_WORDS(0x4);
+
+        u32 depth_buffer_address;
+        u32 color_buffer_address;
+
+        union {
+            // Apparently, the framebuffer width is stored as expected,
+            // while the height is stored as the actual height minus one.
+            // Hence, don't access these fields directly but use the accessors
+            // GetWidth() and GetHeight() instead.
+            BitField< 0, 11, u32> width;
+            BitField<12, 10, u32> height;
+        };
+
+        INSERT_PADDING_WORDS(0x1);
+
+        inline u32 GetColorBufferAddress() const {
+            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(color_buffer_address));
+        }
+        inline u32 GetDepthBufferAddress() const {
+            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(depth_buffer_address));
+        }
+
+        inline u32 GetWidth() const {
+            return width;
+        }
+
+        inline u32 GetHeight() const {
+            return height + 1;
+        }
+    } framebuffer;
+
+    INSERT_PADDING_WORDS(0xe0);
 
     struct {
         enum class Format : u64 {
@@ -355,6 +403,7 @@ struct Regs {
         ADD_FIELD(viewport_depth_range);
         ADD_FIELD(viewport_depth_far_plane);
         ADD_FIELD(viewport_corner);
+        ADD_FIELD(framebuffer);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
         ADD_FIELD(num_vertices);
@@ -411,6 +460,7 @@ ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
 ASSERT_REG_POSITION(num_vertices, 0x228);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
new file mode 100644
index 000000000..a7c1bab3e
--- /dev/null
+++ b/src/video_core/rasterizer.cpp
@@ -0,0 +1,180 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <algorithm>
+
+#include "common/common_types.h"
+
+#include "math.h"
+#include "pica.h"
+#include "rasterizer.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace Rasterizer {
+
+static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
+    u32* color_buffer = (u32*)Memory::GetPointer(registers.framebuffer.GetColorBufferAddress());
+    u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
+
+    // Assuming RGBA8 format until actual framebuffer format handling is implemented
+    *(color_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+static u32 GetDepth(int x, int y) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    return *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2);
+}
+
+static void SetDepth(int x, int y, u16 value) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2)
+{
+    // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
+    struct Fix12P4 {
+        Fix12P4() {}
+        Fix12P4(u16 val) : val(val) {}
+
+        static u16 FracMask() { return 0xF; }
+        static u16 IntMask() { return (u16)~0xF; }
+
+        operator u16() const {
+            return val;
+        }
+
+        bool operator < (const Fix12P4& oth) const {
+            return (u16)*this < (u16)oth;
+        }
+
+    private:
+        u16 val;
+    };
+
+    // vertex positions in rasterizer coordinates
+    auto FloatToFix = [](float24 flt) {
+                          return Fix12P4(flt.ToFloat32() * 16.0f);
+                      };
+    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
+                                             return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+                                         };
+    Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
+                                   ScreenToRasterizerCoordinates(v1.screenpos),
+                                   ScreenToRasterizerCoordinates(v2.screenpos) };
+
+    // TODO: Proper scissor rect test!
+    u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+    u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+
+    min_x = min_x & Fix12P4::IntMask();
+    min_y = min_y & Fix12P4::IntMask();
+    max_x = (max_x + Fix12P4::FracMask()) & Fix12P4::IntMask();
+    max_y = (max_y + Fix12P4::FracMask()) & Fix12P4::IntMask();
+
+    // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not
+    // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias
+    // values which are added to the barycentric coordinates w0, w1 and w2, respectively.
+    // NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones...
+    auto IsRightSideOrFlatBottomEdge = [](const Math::Vec2<Fix12P4>& vtx,
+                                          const Math::Vec2<Fix12P4>& line1,
+                                          const Math::Vec2<Fix12P4>& line2)
+    {
+        if (line1.y == line2.y) {
+            // just check if vertex is above us => bottom line parallel to x-axis
+            return vtx.y < line1.y;
+        } else {
+            // check if vertex is on our left => right side
+            // TODO: Not sure how likely this is to overflow
+            return (int)vtx.x < (int)line1.x + ((int)line2.x - (int)line1.x) * ((int)vtx.y - (int)line1.y) / ((int)line2.y - (int)line1.y);
+        }
+    };
+    int bias0 = IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
+    int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
+    int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
+
+    // TODO: Not sure if looping through x first might be faster
+    for (u16 y = min_y; y < max_y; y += 0x10) {
+        for (u16 x = min_x; x < max_x; x += 0x10) {
+
+            // Calculate the barycentric coordinates w0, w1 and w2
+            auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
+                               const Math::Vec2<Fix12P4>& vtx2,
+                               const Math::Vec2<Fix12P4>& vtx3) {
+                const auto vec1 = (vtx2.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                const auto vec2 = (vtx3.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+                return Cross(vec1, vec2).z;
+            };
+
+            int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+            int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+            int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+            int wsum = w0 + w1 + w2;
+
+            // If current pixel is not covered by the current primitive
+            if (w0 < 0 || w1 < 0 || w2 < 0)
+                continue;
+
+            // Perspective correct attribute interpolation:
+            // Attribute values cannot be calculated by simple linear interpolation since
+            // they are not linear in screen space. For example, when interpolating a
+            // texture coordinate across two vertices, something simple like
+            //     u = (u0*w0 + u1*w1)/(w0+w1)
+            // will not work. However, the attribute value divided by the
+            // clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
+            // in screenspace. Hence, we can linearly interpolate these two independently and
+            // calculate the interpolated attribute by dividing the results.
+            // I.e.
+            //     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
+            //     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
+            //     u = u_over_w / one_over_w
+            //
+            // The generalization to three vertices is straightforward in barycentric coordinates.
+            auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
+                auto attr_over_w = Math::MakeVec3(attr0 / v0.pos.w,
+                                                  attr1 / v1.pos.w,
+                                                  attr2 / v2.pos.w);
+                auto w_inverse   = Math::MakeVec3(float24::FromFloat32(1.f) / v0.pos.w,
+                                                  float24::FromFloat32(1.f) / v1.pos.w,
+                                                  float24::FromFloat32(1.f) / v2.pos.w);
+                auto baricentric_coordinates = Math::MakeVec3(float24::FromFloat32(w0),
+                                                              float24::FromFloat32(w1),
+                                                              float24::FromFloat32(w2));
+
+                float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
+                float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
+                return interpolated_attr_over_w / interpolated_w_inverse;
+            };
+
+            Math::Vec4<u8> primary_color{
+                (u8)(GetInterpolatedAttribute(v0.color.r(), v1.color.r(), v2.color.r()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.g(), v1.color.g(), v2.color.g()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.b(), v1.color.b(), v2.color.b()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * 255)
+            };
+
+            u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 +
+                           (float)v1.screenpos[2].ToFloat32() * w1 +
+                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); // TODO: Shouldn't need to multiply by 65536?
+            SetDepth(x >> 4, y >> 4, z);
+
+            DrawPixel(x >> 4, y >> 4, primary_color);
+        }
+    }
+}
+
+} // namespace Rasterizer
+
+} // namespace Pica
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
new file mode 100644
index 000000000..500be9462
--- /dev/null
+++ b/src/video_core/rasterizer.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex;
+}
+
+namespace Rasterizer {
+
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2);
+
+} // namespace Rasterizer
+
+} // namespace Pica
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index 99ab63dce..48d77cdc4 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -23,6 +23,7 @@
     <ClCompile Include="clipper.cpp" />
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="primitive_assembly.cpp" />
+    <ClCompile Include="rasterizer.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
@@ -34,6 +35,7 @@
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
     <ClInclude Include="primitive_assembly.h" />
+    <ClInclude Include="rasterizer.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index 5222f2fa0..31af4f1df 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -12,6 +12,7 @@
     <ClCompile Include="clipper.cpp" />
     <ClCompile Include="command_processor.cpp" />
     <ClCompile Include="primitive_assembly.cpp" />
+    <ClCompile Include="rasterizer.cpp" />
     <ClCompile Include="utils.cpp" />
     <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
@@ -26,6 +27,7 @@
     <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
     <ClInclude Include="primitive_assembly.h" />
+    <ClInclude Include="rasterizer.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
     <ClInclude Include="vertex_shader.h" />