diff --git a/src/citra_qt/debugger/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics_cmdlists.cpp index 30b8b5dae..e98560a19 100644 --- a/src/citra_qt/debugger/graphics_cmdlists.cpp +++ b/src/citra_qt/debugger/graphics_cmdlists.cpp @@ -78,7 +78,7 @@ QVariant GPUCommandListModel::data(const QModelIndex& index, int role) const // index refers to a specific command const GraphicsDebugger::PicaCommandList& cmdlist = command_lists[item->parent->index].second; const GraphicsDebugger::PicaCommand& cmd = cmdlist[item->index]; - const Pica::CommandHeader& header = cmd.GetHeader(); + const Pica::CommandProcessor::CommandHeader& header = cmd.GetHeader(); if (role == Qt::DisplayRole) { QString content; diff --git a/src/core/hle/service/gsp.cpp b/src/core/hle/service/gsp.cpp index 08e65612e..635f50a53 100644 --- a/src/core/hle/service/gsp.cpp +++ b/src/core/hle/service/gsp.cpp @@ -32,7 +32,7 @@ static inline u8* GetCommandBuffer(u32 thread_id) { if (0 == g_shared_memory) return nullptr; - return Kernel::GetSharedMemoryPointer(g_shared_memory, + return Kernel::GetSharedMemoryPointer(g_shared_memory, 0x800 + (thread_id * sizeof(CommandBuffer))); } @@ -173,7 +173,7 @@ void ExecuteCommand(const Command& command) { case CommandId::SET_COMMAND_LIST_LAST: { auto& params = command.set_command_list_last; - WriteGPURegister(GPU_REG_INDEX(command_processor_config.address), params.address >> 3); + WriteGPURegister(GPU_REG_INDEX(command_processor_config.address), Memory::VirtualToPhysicalAddress(params.address) >> 3); WriteGPURegister(GPU_REG_INDEX(command_processor_config.size), params.size >> 3); // TODO: Not sure if we are supposed to always write this .. seems to trigger processing though @@ -193,20 +193,28 @@ void ExecuteCommand(const Command& command) { case CommandId::SET_MEMORY_FILL: { auto& params = command.memory_fill; - WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_start), params.start1 >> 3); - WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_end), params.end1 >> 3); + WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_start), Memory::VirtualToPhysicalAddress(params.start1) >> 3); + WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_end), Memory::VirtualToPhysicalAddress(params.end1) >> 3); WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].size), params.end1 - params.start1); WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].value), params.value1); - WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_start), params.start2 >> 3); - WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_end), params.end2 >> 3); + WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_start), Memory::VirtualToPhysicalAddress(params.start2) >> 3); + WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_end), Memory::VirtualToPhysicalAddress(params.end2) >> 3); WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].size), params.end2 - params.start2); WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].value), params.value2); break; } - // TODO: Check if texture copies are implemented correctly.. 
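+    // Note: the addresses passed in by the application are virtual; the GPU registers
+    // apparently hold physical addresses in 8-byte units, hence the
+    // VirtualToPhysicalAddress() translation and the >> 3 applied below.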
case CommandId::SET_DISPLAY_TRANSFER: + { + auto& params = command.image_copy; + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1); + // TODO(bunnei): Signalling all of these interrupts here is totally wrong, but it seems to // work well enough for running demos. Need to figure out how these all work and trigger // them correctly. @@ -216,18 +224,19 @@ void ExecuteCommand(const Command& command) { SignalInterrupt(InterruptId::P3D); SignalInterrupt(InterruptId::DMA); break; + } + // TODO: Check if texture copies are implemented correctly.. case CommandId::SET_TEXTURE_COPY: { auto& params = command.image_copy; - WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), params.in_buffer_address >> 3); - WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), params.out_buffer_address >> 3); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3); + WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3); WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size); WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size); WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags); - // TODO: Should this only be ORed with 1 for texture copies? - // trigger transfer + // TODO: Should this register be set to 1 or should instead its value be OR-ed with 1? 
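+        // (SET_DISPLAY_TRANSFER above likewise just writes 1 unconditionally; whether texture
+        // copies need to preserve the remaining bits of this register is not known yet.)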
WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1); break; } diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp index fd40f8ac0..87cf93bac 100644 --- a/src/core/hw/gpu.cpp +++ b/src/core/hw/gpu.cpp @@ -14,6 +14,7 @@ #include "core/hw/gpu.h" +#include "video_core/command_processor.h" #include "video_core/video_core.h" @@ -24,83 +25,6 @@ Regs g_regs; u32 g_cur_line = 0; ///< Current vertical screen line u64 g_last_line_ticks = 0; ///< CPU tick count from last vertical screen line -/** - * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM - * @param - */ -void SetFramebufferLocation(const FramebufferLocation mode) { - switch (mode) { - case FRAMEBUFFER_LOCATION_FCRAM: - { - auto& framebuffer_top = g_regs.framebuffer_config[0]; - auto& framebuffer_sub = g_regs.framebuffer_config[1]; - - framebuffer_top.address_left1 = PADDR_TOP_LEFT_FRAME1; - framebuffer_top.address_left2 = PADDR_TOP_LEFT_FRAME2; - framebuffer_top.address_right1 = PADDR_TOP_RIGHT_FRAME1; - framebuffer_top.address_right2 = PADDR_TOP_RIGHT_FRAME2; - framebuffer_sub.address_left1 = PADDR_SUB_FRAME1; - //framebuffer_sub.address_left2 = unknown; - framebuffer_sub.address_right1 = PADDR_SUB_FRAME2; - //framebuffer_sub.address_right2 = unknown; - break; - } - - case FRAMEBUFFER_LOCATION_VRAM: - { - auto& framebuffer_top = g_regs.framebuffer_config[0]; - auto& framebuffer_sub = g_regs.framebuffer_config[1]; - - framebuffer_top.address_left1 = PADDR_VRAM_TOP_LEFT_FRAME1; - framebuffer_top.address_left2 = PADDR_VRAM_TOP_LEFT_FRAME2; - framebuffer_top.address_right1 = PADDR_VRAM_TOP_RIGHT_FRAME1; - framebuffer_top.address_right2 = PADDR_VRAM_TOP_RIGHT_FRAME2; - framebuffer_sub.address_left1 = PADDR_VRAM_SUB_FRAME1; - //framebuffer_sub.address_left2 = unknown; - framebuffer_sub.address_right1 = PADDR_VRAM_SUB_FRAME2; - //framebuffer_sub.address_right2 = unknown; - break; - } - } -} - -/** - * Gets the location of the framebuffers - * @return Location of framebuffers as FramebufferLocation enum - */ -FramebufferLocation GetFramebufferLocation(u32 address) { - if ((address & ~Memory::VRAM_MASK) == Memory::VRAM_PADDR) { - return FRAMEBUFFER_LOCATION_VRAM; - } else if ((address & ~Memory::FCRAM_MASK) == Memory::FCRAM_PADDR) { - return FRAMEBUFFER_LOCATION_FCRAM; - } else { - ERROR_LOG(GPU, "unknown framebuffer location!"); - } - return FRAMEBUFFER_LOCATION_UNKNOWN; -} - -u32 GetFramebufferAddr(const u32 address) { - switch (GetFramebufferLocation(address)) { - case FRAMEBUFFER_LOCATION_FCRAM: - return Memory::VirtualAddressFromPhysical_FCRAM(address); - case FRAMEBUFFER_LOCATION_VRAM: - return Memory::VirtualAddressFromPhysical_VRAM(address); - default: - ERROR_LOG(GPU, "unknown framebuffer location"); - } - return 0; -} - -/** - * Gets a read-only pointer to a framebuffer in memory - * @param address Physical address of framebuffer - * @return Returns const pointer to raw framebuffer - */ -const u8* GetFramebufferPointer(const u32 address) { - u32 addr = GetFramebufferAddr(address); - return (addr != 0) ? 
Memory::GetPointer(addr) : nullptr; -} - template inline void Read(T &var, const u32 raw_addr) { u32 addr = raw_addr - 0x1EF00000; @@ -141,8 +65,8 @@ inline void Write(u32 addr, const T data) { // TODO: Not sure if this check should be done at GSP level instead if (config.address_start) { // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all - u32* start = (u32*)Memory::GetPointer(config.GetStartAddress()); - u32* end = (u32*)Memory::GetPointer(config.GetEndAddress()); + u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress())); + u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress())); for (u32* ptr = start; ptr < end; ++ptr) *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation @@ -155,8 +79,8 @@ inline void Write(u32 addr, const T data) { { const auto& config = g_regs.display_transfer_config; if (config.trigger & 1) { - u8* source_pointer = Memory::GetPointer(config.GetPhysicalInputAddress()); - u8* dest_pointer = Memory::GetPointer(config.GetPhysicalOutputAddress()); + u8* source_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalInputAddress())); + u8* dest_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalOutputAddress())); for (int y = 0; y < config.output_height; ++y) { // TODO: Why does the register seem to hold twice the framebuffer width? @@ -220,14 +144,15 @@ inline void Write(u32 addr, const T data) { break; } + // Seems like writing to this register triggers processing case GPU_REG_INDEX(command_processor_config.trigger): { const auto& config = g_regs.command_processor_config; if (config.trigger & 1) { - // u32* buffer = (u32*)Memory::GetPointer(config.GetPhysicalAddress()); - ERROR_LOG(GPU, "Beginning 0x%08x bytes of commands from address 0x%08x", config.size, config.GetPhysicalAddress()); - // TODO: Process command list! + u32* buffer = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalAddress())); + u32 size = config.size << 3; + Pica::CommandProcessor::ProcessCommandList(buffer, size); } break; } @@ -276,11 +201,22 @@ void Init() { g_cur_line = 0; g_last_line_ticks = Core::g_app_core->GetTicks(); -// SetFramebufferLocation(FRAMEBUFFER_LOCATION_FCRAM); - SetFramebufferLocation(FRAMEBUFFER_LOCATION_VRAM); - auto& framebuffer_top = g_regs.framebuffer_config[0]; auto& framebuffer_sub = g_regs.framebuffer_config[1]; + + // Setup default framebuffer addresses (located in VRAM) + // .. or at least these are the ones used by system applets. + // There's probably a smarter way to come up with addresses + // like this which does not require hardcoding. + framebuffer_top.address_left1 = 0x181E6000; + framebuffer_top.address_left2 = 0x1822C800; + framebuffer_top.address_right1 = 0x18273000; + framebuffer_top.address_right2 = 0x182B9800; + framebuffer_sub.address_left1 = 0x1848F000; + //framebuffer_sub.address_left2 = unknown; + framebuffer_sub.address_right1 = 0x184C7800; + //framebuffer_sub.address_right2 = unknown; + // TODO: Width should be 240 instead? 
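+    // (The top screen is 400x240 and framebuffers are stored rotated by 90 degrees, so the
+    // per-line width here would presumably be 240.)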
framebuffer_top.width = 480; framebuffer_top.height = 400; diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h index 3065da891..d20311a00 100644 --- a/src/core/hw/gpu.h +++ b/src/core/hw/gpu.h @@ -249,72 +249,6 @@ static_assert(sizeof(Regs) == 0x1000 * sizeof(u32), "Invalid total size of regis extern Regs g_regs; -enum { - TOP_ASPECT_X = 0x5, - TOP_ASPECT_Y = 0x3, - - TOP_HEIGHT = 240, - TOP_WIDTH = 400, - BOTTOM_WIDTH = 320, - - // Physical addresses in FCRAM (chosen arbitrarily) - PADDR_TOP_LEFT_FRAME1 = 0x201D4C00, - PADDR_TOP_LEFT_FRAME2 = 0x202D4C00, - PADDR_TOP_RIGHT_FRAME1 = 0x203D4C00, - PADDR_TOP_RIGHT_FRAME2 = 0x204D4C00, - PADDR_SUB_FRAME1 = 0x205D4C00, - PADDR_SUB_FRAME2 = 0x206D4C00, - // Physical addresses in FCRAM used by ARM9 applications -/* PADDR_TOP_LEFT_FRAME1 = 0x20184E60, - PADDR_TOP_LEFT_FRAME2 = 0x201CB370, - PADDR_TOP_RIGHT_FRAME1 = 0x20282160, - PADDR_TOP_RIGHT_FRAME2 = 0x202C8670, - PADDR_SUB_FRAME1 = 0x202118E0, - PADDR_SUB_FRAME2 = 0x20249CF0,*/ - - // Physical addresses in VRAM - // TODO: These should just be deduced from the ones above - PADDR_VRAM_TOP_LEFT_FRAME1 = 0x181D4C00, - PADDR_VRAM_TOP_LEFT_FRAME2 = 0x182D4C00, - PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x183D4C00, - PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x184D4C00, - PADDR_VRAM_SUB_FRAME1 = 0x185D4C00, - PADDR_VRAM_SUB_FRAME2 = 0x186D4C00, - // Physical addresses in VRAM used by ARM9 applications -/* PADDR_VRAM_TOP_LEFT_FRAME2 = 0x181CB370, - PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x18282160, - PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x182C8670, - PADDR_VRAM_SUB_FRAME1 = 0x182118E0, - PADDR_VRAM_SUB_FRAME2 = 0x18249CF0,*/ -}; - -/// Framebuffer location -enum FramebufferLocation { - FRAMEBUFFER_LOCATION_UNKNOWN, ///< Framebuffer location is unknown - FRAMEBUFFER_LOCATION_FCRAM, ///< Framebuffer is in the GSP heap - FRAMEBUFFER_LOCATION_VRAM, ///< Framebuffer is in VRAM -}; - -/** - * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM - * @param - */ -void SetFramebufferLocation(const FramebufferLocation mode); - -/** - * Gets a read-only pointer to a framebuffer in memory - * @param address Physical address of framebuffer - * @return Returns const pointer to raw framebuffer - */ -const u8* GetFramebufferPointer(const u32 address); - -u32 GetFramebufferAddr(const u32 address); - -/** - * Gets the location of the framebuffers - */ -FramebufferLocation GetFramebufferLocation(u32 address); - template void Read(T &var, const u32 addr); diff --git a/src/core/mem_map.cpp b/src/core/mem_map.cpp index c45746be9..14fc01471 100644 --- a/src/core/mem_map.cpp +++ b/src/core/mem_map.cpp @@ -72,14 +72,14 @@ void Init() { g_base = MemoryMap_Setup(g_views, kNumMemViews, flags, &g_arena); - NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap, + NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap, g_physical_fcram); } void Shutdown() { u32 flags = 0; MemoryMap_Shutdown(g_views, kNumMemViews, flags, &g_arena); - + g_arena.ReleaseSpace(); g_base = NULL; diff --git a/src/core/mem_map.h b/src/core/mem_map.h index 12941f558..3c7810573 100644 --- a/src/core/mem_map.h +++ b/src/core/mem_map.h @@ -14,7 +14,6 @@ namespace Memory { enum { BOOTROM_SIZE = 0x00010000, ///< Bootrom (super secret code/data @ 0x8000) size MPCORE_PRIV_SIZE = 0x00002000, ///< MPCore private memory region size - VRAM_SIZE = 0x00600000, ///< VRAM size DSP_SIZE = 0x00080000, ///< DSP memory size AXI_WRAM_SIZE = 0x00080000, ///< AXI WRAM size @@ -23,8 +22,6 @@ enum { FCRAM_PADDR_END = (FCRAM_PADDR + FCRAM_SIZE), ///< FCRAM end of 
physical space FCRAM_VADDR = 0x08000000, ///< FCRAM virtual address FCRAM_VADDR_END = (FCRAM_VADDR + FCRAM_SIZE), ///< FCRAM end of virtual space - FCRAM_VADDR_FW0B = 0xF0000000, ///< FCRAM adress for firmare FW0B - FCRAM_VADDR_FW0B_END = (FCRAM_VADDR_FW0B + FCRAM_SIZE), ///< FCRAM adress end for FW0B FCRAM_MASK = (FCRAM_SIZE - 1), ///< FCRAM mask SHARED_MEMORY_SIZE = 0x04000000, ///< Shared memory size @@ -73,6 +70,7 @@ enum { HARDWARE_IO_PADDR_END = (HARDWARE_IO_PADDR + HARDWARE_IO_SIZE), HARDWARE_IO_VADDR_END = (HARDWARE_IO_VADDR + HARDWARE_IO_SIZE), + VRAM_SIZE = 0x00600000, VRAM_PADDR = 0x18000000, VRAM_VADDR = 0x1F000000, VRAM_PADDR_END = (VRAM_PADDR + VRAM_SIZE), @@ -112,7 +110,7 @@ struct MemoryBlock { // In 64-bit, this might point to "high memory" (above the 32-bit limit), // so be sure to load it into a 64-bit register. -extern u8 *g_base; +extern u8 *g_base; // These are guaranteed to point to "low memory" addresses (sub-32-bit). // 64-bit: Pointers to low-mem (sub-0x10000000) mirror @@ -147,7 +145,7 @@ void Write32(const u32 addr, const u32 data); void WriteBlock(const u32 addr, const u8* data, const int size); -u8* GetPointer(const u32 Address); +u8* GetPointer(const u32 virtual_address); /** * Maps a block of memory on the heap @@ -169,16 +167,10 @@ inline const char* GetCharPointer(const u32 address) { return (const char *)GetPointer(address); } -inline const u32 VirtualAddressFromPhysical_FCRAM(const u32 address) { - return ((address & FCRAM_MASK) | FCRAM_VADDR); -} +/// Converts a physical address to virtual address +u32 PhysicalToVirtualAddress(const u32 addr); -inline const u32 VirtualAddressFromPhysical_IO(const u32 address) { - return (address + 0x0EB00000); -} - -inline const u32 VirtualAddressFromPhysical_VRAM(const u32 address) { - return (address + 0x07000000); -} +/// Converts a virtual address to physical address +u32 VirtualToPhysicalAddress(const u32 addr); } // namespace diff --git a/src/core/mem_map_funcs.cpp b/src/core/mem_map_funcs.cpp index 305be8468..5772cca52 100644 --- a/src/core/mem_map_funcs.cpp +++ b/src/core/mem_map_funcs.cpp @@ -17,37 +17,44 @@ std::map g_heap_map; std::map g_heap_gsp_map; std::map g_shared_map; -/// Convert a physical address (or firmware-specific virtual address) to primary virtual address -u32 _VirtualAddress(const u32 addr) { - // Our memory interface read/write functions assume virtual addresses. Put any physical address - // to virtual address translations here. This is obviously quite hacky... But we're not doing - // any MMU emulation yet or anything - if ((addr >= FCRAM_PADDR) && (addr < FCRAM_PADDR_END)) { - return VirtualAddressFromPhysical_FCRAM(addr); - - // Virtual address mapping FW0B - } else if ((addr >= FCRAM_VADDR_FW0B) && (addr < FCRAM_VADDR_FW0B_END)) { - return VirtualAddressFromPhysical_FCRAM(addr); - - // Hardware IO - // TODO(bunnei): FixMe - // This isn't going to work... The physical address of HARDWARE_IO conflicts with the virtual - // address of shared memory. - //} else if ((addr >= HARDWARE_IO_PADDR) && (addr < HARDWARE_IO_PADDR_END)) { - // return (addr + 0x0EB00000); - +/// Convert a physical address to virtual address +u32 PhysicalToVirtualAddress(const u32 addr) { + // Our memory interface read/write functions assume virtual addresses. Put any physical address + // to virtual address translations here. This is quite hacky, but necessary until we implement + // proper MMU emulation. + // TODO: Screw it, I'll let bunnei figure out how to do this properly. 
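+    // Example of the mapping implemented below: a physical VRAM address such as 0x18000000
+    // maps to virtual 0x1F000000, while physical FCRAM addresses map into the 0x08000000
+    // virtual region.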
+ if ((addr >= VRAM_PADDR) && (addr < VRAM_PADDR_END)) { + return addr - VRAM_PADDR + VRAM_VADDR; + } else if ((addr >= FCRAM_PADDR) && (addr < FCRAM_PADDR_END)) { + return addr - FCRAM_PADDR + FCRAM_VADDR; } + + ERROR_LOG(MEMMAP, "Unknown physical address @ 0x%08x", addr); + return addr; +} + +/// Convert a virtual address to physical address +u32 VirtualToPhysicalAddress(const u32 addr) { + // Our memory interface read/write functions assume virtual addresses. Put any virtual address + // to physical address translations here. This is quite hacky, but necessary until we implement + // proper MMU emulation. + // TODO: Screw it, I'll let bunnei figure out how to do this properly. + if ((addr >= VRAM_VADDR) && (addr < VRAM_VADDR_END)) { + return addr - VRAM_VADDR + VRAM_PADDR; + } else if ((addr >= FCRAM_VADDR) && (addr < FCRAM_VADDR_END)) { + return addr - FCRAM_VADDR + FCRAM_PADDR; + } + + ERROR_LOG(MEMMAP, "Unknown virtual address @ 0x%08x", addr); + return addr; } template <typename T> -inline void Read(T &var, const u32 addr) { +inline void Read(T &var, const u32 vaddr) { // TODO: Figure out the fastest order of tests for both read and write (they are probably different). // TODO: Make sure this represents the mirrors in a correct way. // Could just do a base-relative read, too.... TODO - const u32 vaddr = _VirtualAddress(addr); - // Kernel memory command buffer if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) { var = *((const T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK]); @@ -91,9 +98,8 @@ inline void Read(T &var, const u32 addr) { } template <typename T> -inline void Write(u32 addr, const T data) { - u32 vaddr = _VirtualAddress(addr); - +inline void Write(u32 vaddr, const T data) { + // Kernel memory command buffer if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) { *(T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK] = data; @@ -133,16 +139,14 @@ inline void Write(u32 addr, const T data) { // _assert_msg_(MEMMAP, false, "umimplemented write to Configuration Memory"); //} else if ((vaddr & 0xFFFFF000) == 0x1FF81000) { // _assert_msg_(MEMMAP, false, "umimplemented write to shared page"); - + // Error out...
} else { ERROR_LOG(MEMMAP, "unknown Write%d 0x%08X @ 0x%08X", sizeof(data) * 8, data, vaddr); } } -u8 *GetPointer(const u32 addr) { - const u32 vaddr = _VirtualAddress(addr); - +u8 *GetPointer(const u32 vaddr) { // Kernel memory command buffer if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) { return g_kernel_mem + (vaddr & KERNEL_MEMORY_MASK); @@ -185,12 +189,12 @@ u8 *GetPointer(const u32 addr) { */ u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) { MemoryBlock block; - + block.base_address = HEAP_VADDR; block.size = size; block.operation = operation; block.permissions = permissions; - + if (g_heap_map.size() > 0) { const MemoryBlock last_block = g_heap_map.rbegin()->second; block.address = last_block.address + last_block.size; @@ -208,12 +212,12 @@ u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) { */ u32 MapBlock_HeapGSP(u32 size, u32 operation, u32 permissions) { MemoryBlock block; - + block.base_address = HEAP_GSP_VADDR; block.size = size; block.operation = operation; block.permissions = permissions; - + if (g_heap_gsp_map.size() > 0) { const MemoryBlock last_block = g_heap_gsp_map.rbegin()->second; block.address = last_block.address + last_block.size; diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e43e6e1bb..8e7b93acb 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -1,10 +1,22 @@ -set(SRCS video_core.cpp +set(SRCS clipper.cpp + command_processor.cpp + primitive_assembly.cpp + rasterizer.cpp utils.cpp + vertex_shader.cpp + video_core.cpp renderer_opengl/renderer_opengl.cpp) -set(HEADERS video_core.h +set(HEADERS clipper.h + command_processor.h + math.h + primitive_assembly.h + rasterizer.h utils.h + video_core.h renderer_base.h + vertex_shader.h + video_core.h renderer_opengl/renderer_opengl.h) add_library(video_core STATIC ${SRCS} ${HEADERS}) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp new file mode 100644 index 000000000..b7180328c --- /dev/null +++ b/src/video_core/clipper.cpp @@ -0,0 +1,179 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#include + +#include "clipper.h" +#include "pica.h" +#include "rasterizer.h" +#include "vertex_shader.h" + +namespace Pica { + +namespace Clipper { + +struct ClippingEdge { +public: + enum Type { + POS_X = 0, + NEG_X = 1, + POS_Y = 2, + NEG_Y = 3, + POS_Z = 4, + NEG_Z = 5, + }; + + ClippingEdge(Type type, float24 position) : type(type), pos(position) {} + + bool IsInside(const OutputVertex& vertex) const { + switch (type) { + case POS_X: return vertex.pos.x <= pos * vertex.pos.w; + case NEG_X: return vertex.pos.x >= pos * vertex.pos.w; + case POS_Y: return vertex.pos.y <= pos * vertex.pos.w; + case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w; + + // TODO: Check z compares ... should be 0..1 instead? 
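+            // Note that these tests happen in homogeneous clip space: e.g. for POS_X with
+            // pos == +1.0, "x <= pos * w" is equivalent to x / w <= 1 after the perspective
+            // divide (assuming w > 0).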
+ case POS_Z: return vertex.pos.z <= pos * vertex.pos.w; + + default: + case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w; + } + } + + bool IsOutSide(const OutputVertex& vertex) const { + return !IsInside(vertex); + } + + OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { + auto dotpr = [this](const OutputVertex& vtx) { + switch (type) { + case POS_X: return vtx.pos.x - vtx.pos.w; + case NEG_X: return -vtx.pos.x - vtx.pos.w; + case POS_Y: return vtx.pos.y - vtx.pos.w; + case NEG_Y: return -vtx.pos.y - vtx.pos.w; + + // TODO: Verify z clipping + case POS_Z: return vtx.pos.z - vtx.pos.w; + + default: + case NEG_Z: return -vtx.pos.w; + } + }; + + float24 dp = dotpr(v0); + float24 dp_prev = dotpr(v1); + float24 factor = dp_prev / (dp_prev - dp); + + return OutputVertex::Lerp(factor, v0, v1); + } + +private: + Type type; + float24 pos; +}; + +static void InitScreenCoordinates(OutputVertex& vtx) +{ + struct { + float24 halfsize_x; + float24 offset_x; + float24 halfsize_y; + float24 offset_y; + float24 zscale; + float24 offset_z; + } viewport; + + viewport.halfsize_x = float24::FromRawFloat24(registers.viewport_size_x); + viewport.halfsize_y = float24::FromRawFloat24(registers.viewport_size_y); + viewport.offset_x = float24::FromFloat32(registers.viewport_corner.x); + viewport.offset_y = float24::FromFloat32(registers.viewport_corner.y); + viewport.zscale = float24::FromRawFloat24(registers.viewport_depth_range); + viewport.offset_z = float24::FromRawFloat24(registers.viewport_depth_far_plane); + + // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not + vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x / float24::FromFloat32(2.0) + viewport.offset_x; + vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; + vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale; +} + +void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { + + // TODO (neobrain): + // The list of output vertices has some fixed maximum size, + // however I haven't taken the time to figure out what it is exactly. + // For now, we hence just assume a maximal size of 1000 vertices. + const size_t max_vertices = 1000; + std::vector buffer_vertices; + std::vector output_list{ &v0, &v1, &v2 }; + + // Make sure to reserve space for all vertices. + // Without this, buffer reallocation would invalidate references. + buffer_vertices.reserve(max_vertices); + + // Simple implementation of the Sutherland-Hodgman clipping algorithm. + // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) + for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)), + ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)), + ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)), + ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)), + ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), + ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { + + const std::vector input_list = output_list; + output_list.clear(); + + const OutputVertex* reference_vertex = input_list.back(); + + for (const auto& vertex : input_list) { + // NOTE: This algorithm changes vertex order in some cases! 
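+            // Standard Sutherland-Hodgman cases: if the current vertex is inside but the
+            // previous (reference) vertex was outside, the edge intersection is emitted before
+            // the vertex itself; if the current vertex is outside but the previous one was
+            // inside, only the intersection is emitted.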
+ if (edge.IsInside(*vertex)) { + if (edge.IsOutSide(*reference_vertex)) { + buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); + output_list.push_back(&(buffer_vertices.back())); + } + + output_list.push_back(vertex); + } else if (edge.IsInside(*reference_vertex)) { + buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); + output_list.push_back(&(buffer_vertices.back())); + } + + reference_vertex = vertex; + } + + // Need to have at least a full triangle to continue... + if (output_list.size() < 3) + return; + } + + InitScreenCoordinates(*(output_list[0])); + InitScreenCoordinates(*(output_list[1])); + + for (int i = 0; i < output_list.size() - 2; i ++) { + OutputVertex& vtx0 = *(output_list[0]); + OutputVertex& vtx1 = *(output_list[i+1]); + OutputVertex& vtx2 = *(output_list[i+2]); + + InitScreenCoordinates(vtx2); + + DEBUG_LOG(GPU, + "Triangle %d/%d (%d buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), " + "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and " + "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)", + i,output_list.size(), buffer_vertices.size(), + vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),output_list.size(), + vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), + vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(), + vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), vtx0.screenpos.z.ToFloat32(), + vtx1.screenpos.x.ToFloat32(), vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), + vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), vtx2.screenpos.z.ToFloat32()); + + Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2); + } +} + + +} // namespace + +} // namespace diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h new file mode 100644 index 000000000..14d31ca1e --- /dev/null +++ b/src/video_core/clipper.h @@ -0,0 +1,21 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +namespace Pica { + +namespace VertexShader { + struct OutputVertex; +} + +namespace Clipper { + +using VertexShader::OutputVertex; + +void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); + +} // namespace + +} // namespace diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp new file mode 100644 index 000000000..020a4da3f --- /dev/null +++ b/src/video_core/command_processor.cpp @@ -0,0 +1,238 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
+ +#include "command_processor.h" +#include "math.h" +#include "pica.h" +#include "primitive_assembly.h" +#include "vertex_shader.h" + + +namespace Pica { + +Regs registers; + +namespace CommandProcessor { + +static int float_regs_counter = 0; + +static u32 uniform_write_buffer[4]; + +// Used for VSLoadProgramData and VSLoadSwizzleData +static u32 vs_binary_write_offset = 0; +static u32 vs_swizzle_write_offset = 0; + +static inline void WritePicaReg(u32 id, u32 value) { + u32 old_value = registers[id]; + registers[id] = value; + + switch(id) { + // It seems like these trigger vertex rendering + case PICA_REG_INDEX(trigger_draw): + case PICA_REG_INDEX(trigger_draw_indexed): + { + const auto& attribute_config = registers.vertex_attributes; + const u8* const base_address = Memory::GetPointer(attribute_config.GetBaseAddress()); + + // Information about internal vertex attributes + const u8* vertex_attribute_sources[16]; + u32 vertex_attribute_strides[16]; + u32 vertex_attribute_formats[16]; + u32 vertex_attribute_elements[16]; + u32 vertex_attribute_element_size[16]; + + // Setup attribute data from loaders + for (int loader = 0; loader < 12; ++loader) { + const auto& loader_config = attribute_config.attribute_loaders[loader]; + + const u8* load_address = base_address + loader_config.data_offset; + + // TODO: What happens if a loader overwrites a previous one's data? + for (int component = 0; component < loader_config.component_count; ++component) { + u32 attribute_index = loader_config.GetComponent(component); + vertex_attribute_sources[attribute_index] = load_address; + vertex_attribute_strides[attribute_index] = loader_config.byte_count; + vertex_attribute_formats[attribute_index] = (u32)attribute_config.GetFormat(attribute_index); + vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index); + vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index); + load_address += attribute_config.GetStride(attribute_index); + } + } + + // Load vertices + bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed)); + + const auto& index_info = registers.index_array; + const u8* index_address_8 = (u8*)base_address + index_info.offset; + const u16* index_address_16 = (u16*)index_address_8; + bool index_u16 = (bool)index_info.format; + + for (int index = 0; index < registers.num_vertices; ++index) + { + int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; + + if (is_indexed) { + // TODO: Implement some sort of vertex cache! + } + + // Initialize data for the current vertex + VertexShader::InputVertex input; + + for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { + for (int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + const u8* srcdata = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; + const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata : + (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata : + (vertex_attribute_formats[i] == 2) ? 
*(s16*)srcdata : + *(float*)srcdata; + input.attr[i][comp] = float24::FromFloat32(srcval); + DEBUG_LOG(GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f", + comp, i, vertex, index, + attribute_config.GetBaseAddress(), + vertex_attribute_sources[i] - base_address, + srcdata - vertex_attribute_sources[i], + input.attr[i][comp].ToFloat32()); + } + } + VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes()); + + if (is_indexed) { + // TODO: Add processed vertex to vertex cache! + } + + PrimitiveAssembly::SubmitVertex(output); + } + break; + } + + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[3], 0x2c4): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[4], 0x2c5): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[5], 0x2c6): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7): + case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8): + { + auto& uniform_setup = registers.vs_uniform_setup; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that 4 float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. + if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + auto& uniform = VertexShader::GetFloatUniform(uniform_setup.index); + + if (uniform_setup.index > 95) { + ERROR_LOG(GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); + break; + } + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0,1,2,3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRawFloat24(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRawFloat24(((uniform_write_buffer[0] & 0xFF)<<16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRawFloat24(((uniform_write_buffer[1] & 0xFFFF)<<8) | ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF); + } + + DEBUG_LOG(GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! 
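+                // (i.e. whether the index auto-increment performed here is also visible in the
+                // register on real hardware)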
+ uniform_setup.index = uniform_setup.index + 1; + } + break; + } + + // Seems to be used to reset the write pointer for VSLoadProgramData + case PICA_REG_INDEX(vs_program.begin_load): + vs_binary_write_offset = 0; + break; + + // Load shader program code + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[2], 0x2ce): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[3], 0x2cf): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[4], 0x2d0): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[5], 0x2d1): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2): + case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3): + { + VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value); + vs_binary_write_offset++; + break; + } + + // Seems to be used to reset the write pointer for VSLoadSwizzleData + case PICA_REG_INDEX(vs_swizzle_patterns.begin_load): + vs_swizzle_write_offset = 0; + break; + + // Load swizzle pattern data + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[2], 0x2d8): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[3], 0x2d9): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[4], 0x2da): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[5], 0x2db): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc): + case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd): + { + VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value); + vs_swizzle_write_offset++; + break; + } + + default: + break; + } +} + +static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) { + const CommandHeader& header = *(const CommandHeader*)(&first_command_word[1]); + + u32* read_pointer = (u32*)first_command_word; + + // TODO: Take parameter mask into consideration! + + WritePicaReg(header.cmd_id, *read_pointer); + read_pointer += 2; + + for (int i = 1; i < 1+header.extra_data_length; ++i) { + u32 cmd = header.cmd_id + ((header.group_commands) ? i : 0); + WritePicaReg(cmd, *read_pointer); + ++read_pointer; + } + + // align read pointer to 8 bytes + if ((first_command_word - read_pointer) % 2) + ++read_pointer; + + return read_pointer - first_command_word; +} + +void ProcessCommandList(const u32* list, u32 size) { + u32* read_pointer = (u32*)list; + + while (read_pointer < list + size) { + read_pointer += ExecuteCommandBlock(read_pointer); + } +} + +} // namespace + +} // namespace diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h new file mode 100644 index 000000000..6b6241a25 --- /dev/null +++ b/src/video_core/command_processor.h @@ -0,0 +1,31 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
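+
+// A command block, as parsed by ProcessCommandList(), apparently consists of one 32-bit
+// parameter word followed by a CommandHeader word and extra_data_length additional parameter
+// words, with each block aligned to 8 bytes.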
+ +#pragma once + +#include "common/bit_field.h" +#include "common/common_types.h" + +#include "pica.h" + +namespace Pica { + +namespace CommandProcessor { + +union CommandHeader { + u32 hex; + + BitField< 0, 16, u32> cmd_id; + BitField<16, 4, u32> parameter_mask; + BitField<20, 11, u32> extra_data_length; + BitField<31, 1, u32> group_commands; +}; +static_assert(std::is_standard_layout::value == true, "CommandHeader does not use standard layout"); +static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); + +void ProcessCommandList(const u32* list, u32 size); + +} // namespace + +} // namespace diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h index 5d85f90b9..2ba873457 100644 --- a/src/video_core/gpu_debugger.h +++ b/src/video_core/gpu_debugger.h @@ -11,6 +11,8 @@ #include "common/log.h" #include "core/hle/service/gsp.h" + +#include "command_processor.h" #include "pica.h" class GraphicsDebugger @@ -20,10 +22,10 @@ public: // A vector of commands represented by their raw byte sequence struct PicaCommand : public std::vector { - const Pica::CommandHeader& GetHeader() const + const Pica::CommandProcessor::CommandHeader& GetHeader() const { const u32& val = at(1); - return *(Pica::CommandHeader*)&val; + return *(Pica::CommandProcessor::CommandHeader*)&val; } }; @@ -99,7 +101,7 @@ public: PicaCommandList cmdlist; for (u32* parse_pointer = command_list; parse_pointer < command_list + size_in_words;) { - const Pica::CommandHeader header = static_cast(parse_pointer[1]); + const Pica::CommandProcessor::CommandHeader& header = *(Pica::CommandProcessor::CommandHeader*)(&parse_pointer[1]); cmdlist.push_back(PicaCommand()); auto& cmd = cmdlist.back(); diff --git a/src/video_core/math.h b/src/video_core/math.h new file mode 100644 index 000000000..7030f2cfb --- /dev/null +++ b/src/video_core/math.h @@ -0,0 +1,578 @@ +// Licensed under GPLv2 +// Refer to the license.txt file included. + + +// Copyright 2014 Tony Wasserka +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of the owner nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
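+
+// Templated 2-, 3- and 4-component vector classes providing the usual arithmetic operators,
+// dot/cross products, linear interpolation helpers and GLSL-style component/swizzle accessors.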
+ +#pragma once + +#include + +namespace Math { + +template class Vec2; +template class Vec3; +template class Vec4; + + +template +class Vec2 { +public: + struct { + T x,y; + }; + + T* AsArray() { return &x; } + + Vec2() = default; + Vec2(const T a[2]) : x(a[0]), y(a[1]) {} + Vec2(const T& _x, const T& _y) : x(_x), y(_y) {} + + template + Vec2 Cast() const { + return Vec2((T2)x, (T2)y); + } + + static Vec2 AssignToAll(const T& f) + { + return Vec2(f, f); + } + + void Write(T a[2]) + { + a[0] = x; a[1] = y; + } + + Vec2 operator +(const Vec2& other) const + { + return Vec2(x+other.x, y+other.y); + } + void operator += (const Vec2 &other) + { + x+=other.x; y+=other.y; + } + Vec2 operator -(const Vec2& other) const + { + return Vec2(x-other.x, y-other.y); + } + void operator -= (const Vec2& other) + { + x-=other.x; y-=other.y; + } + Vec2 operator -() const + { + return Vec2(-x,-y); + } + Vec2 operator * (const Vec2& other) const + { + return Vec2(x*other.x, y*other.y); + } + template + Vec2 operator * (const V& f) const + { + return Vec2(x*f,y*f); + } + template + void operator *= (const V& f) + { + x*=f; y*=f; + } + template + Vec2 operator / (const V& f) const + { + return Vec2(x/f,y/f); + } + template + void operator /= (const V& f) + { + *this = *this / f; + } + + T Length2() const + { + return x*x + y*y; + } + + // Only implemented for T=float + float Length() const; + void SetLength(const float l); + Vec2 WithLength(const float l) const; + float Distance2To(Vec2 &other); + Vec2 Normalized() const; + float Normalize(); // returns the previous length, which is often useful + + T& operator [] (int i) //allow vector[1] = 3 (vector.y=3) + { + return *((&x) + i); + } + T operator [] (const int i) const + { + return *((&x) + i); + } + + void SetZero() + { + x=0; y=0; + } + + // Common aliases: UV (texel coordinates), ST (texture coordinates) + T& u() { return x; } + T& v() { return y; } + T& s() { return x; } + T& t() { return y; } + + const T& u() const { return x; } + const T& v() const { return y; } + const T& s() const { return x; } + const T& t() const { return y; } + + // swizzlers - create a subvector of specific components + Vec2 yx() const { return Vec2(y, x); } + Vec2 vu() const { return Vec2(y, x); } + Vec2 ts() const { return Vec2(y, x); } + + // Inserters to add new elements to effectively create larger vectors containing this Vec2 + Vec3 InsertBeforeX(const T& value) { + return Vec3(value, x, y); + } + Vec3 InsertBeforeY(const T& value) { + return Vec3(x, value, y); + } + Vec3 Append(const T& value) { + return Vec3(x, y, value); + } +}; + +template +Vec2 operator * (const V& f, const Vec2& vec) +{ + return Vec2(f*vec.x,f*vec.y); +} + +typedef Vec2 Vec2f; + +template +class Vec3 +{ +public: + struct + { + T x,y,z; + }; + + T* AsArray() { return &x; } + + Vec3() = default; + Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} + Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} + + template + Vec3 Cast() const { + return Vec3((T2)x, (T2)y, (T2)z); + } + + // Only implemented for T=int and T=float + static Vec3 FromRGB(unsigned int rgb); + unsigned int ToRGB() const; // alpha bits set to zero + + static Vec3 AssignToAll(const T& f) + { + return Vec3(f, f, f); + } + + void Write(T a[3]) + { + a[0] = x; a[1] = y; a[2] = z; + } + + Vec3 operator +(const Vec3 &other) const + { + return Vec3(x+other.x, y+other.y, z+other.z); + } + void operator += (const Vec3 &other) + { + x+=other.x; y+=other.y; z+=other.z; + } + Vec3 operator -(const Vec3 &other) const + { + return 
Vec3(x-other.x, y-other.y, z-other.z); + } + void operator -= (const Vec3 &other) + { + x-=other.x; y-=other.y; z-=other.z; + } + Vec3 operator -() const + { + return Vec3(-x,-y,-z); + } + Vec3 operator * (const Vec3 &other) const + { + return Vec3(x*other.x, y*other.y, z*other.z); + } + template + Vec3 operator * (const V& f) const + { + return Vec3(x*f,y*f,z*f); + } + template + void operator *= (const V& f) + { + x*=f; y*=f; z*=f; + } + template + Vec3 operator / (const V& f) const + { + return Vec3(x/f,y/f,z/f); + } + template + void operator /= (const V& f) + { + *this = *this / f; + } + + T Length2() const + { + return x*x + y*y + z*z; + } + + // Only implemented for T=float + float Length() const; + void SetLength(const float l); + Vec3 WithLength(const float l) const; + float Distance2To(Vec3 &other); + Vec3 Normalized() const; + float Normalize(); // returns the previous length, which is often useful + + T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) + { + return *((&x) + i); + } + T operator [] (const int i) const + { + return *((&x) + i); + } + + void SetZero() + { + x=0; y=0; z=0; + } + + // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates) + T& u() { return x; } + T& v() { return y; } + T& w() { return z; } + + T& r() { return x; } + T& g() { return y; } + T& b() { return z; } + + T& s() { return x; } + T& t() { return y; } + T& q() { return z; } + + const T& u() const { return x; } + const T& v() const { return y; } + const T& w() const { return z; } + + const T& r() const { return x; } + const T& g() const { return y; } + const T& b() const { return z; } + + const T& s() const { return x; } + const T& t() const { return y; } + const T& q() const { return z; } + + // swizzlers - create a subvector of specific components + // e.g. 
Vec2 uv() { return Vec2(x,y); } + // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx) +#define _DEFINE_SWIZZLER2(a, b, name) Vec2 name() const { return Vec2(a, b); } +#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \ + _DEFINE_SWIZZLER2(a, b, a##b); \ + _DEFINE_SWIZZLER2(a, b, a2##b2); \ + _DEFINE_SWIZZLER2(a, b, a3##b3); \ + _DEFINE_SWIZZLER2(a, b, a4##b4); \ + _DEFINE_SWIZZLER2(b, a, b##a); \ + _DEFINE_SWIZZLER2(b, a, b2##a2); \ + _DEFINE_SWIZZLER2(b, a, b3##a3); \ + _DEFINE_SWIZZLER2(b, a, b4##a4); + + DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t); + DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q); + DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q); +#undef DEFINE_SWIZZLER2 +#undef _DEFINE_SWIZZLER2 + + // Inserters to add new elements to effectively create larger vectors containing this Vec2 + Vec4 InsertBeforeX(const T& value) { + return Vec4(value, x, y, z); + } + Vec4 InsertBeforeY(const T& value) { + return Vec4(x, value, y, z); + } + Vec4 InsertBeforeZ(const T& value) { + return Vec4(x, y, value, z); + } + Vec4 Append(const T& value) { + return Vec4(x, y, z, value); + } +}; + +template +Vec3 operator * (const V& f, const Vec3& vec) +{ + return Vec3(f*vec.x,f*vec.y,f*vec.z); +} + +typedef Vec3 Vec3f; + +template +class Vec4 +{ +public: + struct + { + T x,y,z,w; + }; + + T* AsArray() { return &x; } + + Vec4() = default; + Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {} + Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {} + + template + Vec4 Cast() const { + return Vec4((T2)x, (T2)y, (T2)z, (T2)w); + } + + // Only implemented for T=int and T=float + static Vec4 FromRGBA(unsigned int rgba); + unsigned int ToRGBA() const; + + static Vec4 AssignToAll(const T& f) { + return Vec4(f, f, f, f); + } + + void Write(T a[4]) + { + a[0] = x; a[1] = y; a[2] = z; a[3] = w; + } + + Vec4 operator +(const Vec4& other) const + { + return Vec4(x+other.x, y+other.y, z+other.z, w+other.w); + } + void operator += (const Vec4& other) + { + x+=other.x; y+=other.y; z+=other.z; w+=other.w; + } + Vec4 operator -(const Vec4 &other) const + { + return Vec4(x-other.x, y-other.y, z-other.z, w-other.w); + } + void operator -= (const Vec4 &other) + { + x-=other.x; y-=other.y; z-=other.z; w-=other.w; + } + Vec4 operator -() const + { + return Vec4(-x,-y,-z,-w); + } + Vec4 operator * (const Vec4 &other) const + { + return Vec4(x*other.x, y*other.y, z*other.z, w*other.w); + } + template + Vec4 operator * (const V& f) const + { + return Vec4(x*f,y*f,z*f,w*f); + } + template + void operator *= (const V& f) + { + x*=f; y*=f; z*=f; w*=f; + } + template + Vec4 operator / (const V& f) const + { + return Vec4(x/f,y/f,z/f,w/f); + } + template + void operator /= (const V& f) + { + *this = *this / f; + } + + T Length2() const + { + return x*x + y*y + z*z + w*w; + } + + // Only implemented for T=float + float Length() const; + void SetLength(const float l); + Vec4 WithLength(const float l) const; + float Distance2To(Vec4 &other); + Vec4 Normalized() const; + float Normalize(); // returns the previous length, which is often useful + + T& operator [] (int i) //allow vector[2] = 3 (vector.z=3) + { + return *((&x) + i); + } + T operator [] (const int i) const + { + return *((&x) + i); + } + + void SetZero() + { + x=0; y=0; z=0; + } + + // Common alias: RGBA (colors) + T& r() { return x; } + T& g() { return y; } + T& b() { return z; } + T& a() { return w; } + + const T& r() const { return x; } + const T& 
g() const { return y; } + const T& b() const { return z; } + const T& a() const { return w; } + + // swizzlers - create a subvector of specific components + // e.g. Vec2 uv() { return Vec2(x,y); } + // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx) +#define _DEFINE_SWIZZLER2(a, b, name) Vec2 name() const { return Vec2(a, b); } +#define DEFINE_SWIZZLER2(a, b, a2, b2) \ + _DEFINE_SWIZZLER2(a, b, a##b); \ + _DEFINE_SWIZZLER2(a, b, a2##b2); \ + _DEFINE_SWIZZLER2(b, a, b##a); \ + _DEFINE_SWIZZLER2(b, a, b2##a2); + + DEFINE_SWIZZLER2(x, y, r, g); + DEFINE_SWIZZLER2(x, z, r, b); + DEFINE_SWIZZLER2(x, w, r, a); + DEFINE_SWIZZLER2(y, z, g, b); + DEFINE_SWIZZLER2(y, w, g, a); + DEFINE_SWIZZLER2(z, w, b, a); +#undef DEFINE_SWIZZLER2 +#undef _DEFINE_SWIZZLER2 + +#define _DEFINE_SWIZZLER3(a, b, c, name) Vec3 name() const { return Vec3(a, b, c); } +#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \ + _DEFINE_SWIZZLER3(a, b, c, a##b##c); \ + _DEFINE_SWIZZLER3(a, c, b, a##c##b); \ + _DEFINE_SWIZZLER3(b, a, c, b##a##c); \ + _DEFINE_SWIZZLER3(b, c, a, b##c##a); \ + _DEFINE_SWIZZLER3(c, a, b, c##a##b); \ + _DEFINE_SWIZZLER3(c, b, a, c##b##a); \ + _DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \ + _DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \ + _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \ + _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \ + _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \ + _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2); + + DEFINE_SWIZZLER3(x, y, z, r, g, b); + DEFINE_SWIZZLER3(x, y, w, r, g, a); + DEFINE_SWIZZLER3(x, z, w, r, b, a); + DEFINE_SWIZZLER3(y, z, w, g, b, a); +#undef DEFINE_SWIZZLER3 +#undef _DEFINE_SWIZZLER3 +}; + + +template +Vec4 operator * (const V& f, const Vec4& vec) +{ + return Vec4(f*vec.x,f*vec.y,f*vec.z,f*vec.w); +} + +typedef Vec4 Vec4f; + + +template +static inline T Dot(const Vec2& a, const Vec2& b) +{ + return a.x*b.x + a.y*b.y; +} + +template +static inline T Dot(const Vec3& a, const Vec3& b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z; +} + +template +static inline T Dot(const Vec4& a, const Vec4& b) +{ + return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w; +} + +template +static inline Vec3 Cross(const Vec3& a, const Vec3& b) +{ + return Vec3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x); +} + +// linear interpolation via float: 0.0=begin, 1.0=end +template +static inline X Lerp(const X& begin, const X& end, const float t) +{ + return begin*(1.f-t) + end*t; +} + +// linear interpolation via int: 0=begin, base=end +template +static inline X LerpInt(const X& begin, const X& end, const int t) +{ + return (begin*(base-t) + end*t) / base; +} + +// Utility vector factories +template +static inline Vec2 MakeVec2(const T& x, const T& y) +{ + return Vec2{x, y}; +} + +template +static inline Vec3 MakeVec3(const T& x, const T& y, const T& z) +{ + return Vec3{x, y, z}; +} + +template +static inline Vec4 MakeVec4(const T& x, const T& y, const T& z, const T& w) +{ + return Vec4{x, y, z, w}; +} + +} // namespace diff --git a/src/video_core/pica.h b/src/video_core/pica.h index d64559d72..81af57336 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -11,6 +11,8 @@ #include "common/bit_field.h" #include "common/common_types.h" +#include "core/mem_map.h" + namespace Pica { // Returns index corresponding to the Regs member labeled by field_name @@ -45,12 +47,104 @@ struct Regs { INSERT_PADDING_WORDS(0x41); BitField<0, 24, u32> viewport_size_x; - INSERT_PADDING_WORDS(1); + INSERT_PADDING_WORDS(0x1); BitField<0, 24, 
u32> viewport_size_y; - INSERT_PADDING_WORDS(0x1bc); + INSERT_PADDING_WORDS(0x9); + + BitField<0, 24, u32> viewport_depth_range; // float24 + BitField<0, 24, u32> viewport_depth_far_plane; // float24 + + INSERT_PADDING_WORDS(0x1); union { + // Maps components of output vertex attributes to semantics + enum Semantic : u32 + { + POSITION_X = 0, + POSITION_Y = 1, + POSITION_Z = 2, + POSITION_W = 3, + + COLOR_R = 8, + COLOR_G = 9, + COLOR_B = 10, + COLOR_A = 11, + + TEXCOORD0_U = 12, + TEXCOORD0_V = 13, + TEXCOORD1_U = 14, + TEXCOORD1_V = 15, + TEXCOORD2_U = 22, + TEXCOORD2_V = 23, + + INVALID = 31, + }; + + BitField< 0, 5, Semantic> map_x; + BitField< 8, 5, Semantic> map_y; + BitField<16, 5, Semantic> map_z; + BitField<24, 5, Semantic> map_w; + } vs_output_attributes[7]; + + INSERT_PADDING_WORDS(0x11); + + union { + BitField< 0, 16, u32> x; + BitField<16, 16, u32> y; + } viewport_corner; + + INSERT_PADDING_WORDS(0xa7); + + struct { + enum ColorFormat : u32 { + RGBA8 = 0, + RGB8 = 1, + RGBA5551 = 2, + RGB565 = 3, + RGBA4 = 4, + }; + + INSERT_PADDING_WORDS(0x6); + + u32 depth_format; + u32 color_format; + + INSERT_PADDING_WORDS(0x4); + + u32 depth_buffer_address; + u32 color_buffer_address; + + union { + // Apparently, the framebuffer width is stored as expected, + // while the height is stored as the actual height minus one. + // Hence, don't access these fields directly but use the accessors + // GetWidth() and GetHeight() instead. + BitField< 0, 11, u32> width; + BitField<12, 10, u32> height; + }; + + INSERT_PADDING_WORDS(0x1); + + inline u32 GetColorBufferAddress() const { + return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(color_buffer_address)); + } + inline u32 GetDepthBufferAddress() const { + return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(depth_buffer_address)); + } + + inline u32 GetWidth() const { + return width; + } + + inline u32 GetHeight() const { + return height + 1; + } + } framebuffer; + + INSERT_PADDING_WORDS(0xe0); + + struct { enum class Format : u64 { BYTE = 0, UBYTE = 1, @@ -58,36 +152,230 @@ struct Regs { FLOAT = 3, }; - BitField< 0, 2, Format> format0; - BitField< 2, 2, u64> size0; // number of elements minus 1 - BitField< 4, 2, Format> format1; - BitField< 6, 2, u64> size1; - BitField< 8, 2, Format> format2; - BitField<10, 2, u64> size2; - BitField<12, 2, Format> format3; - BitField<14, 2, u64> size3; - BitField<16, 2, Format> format4; - BitField<18, 2, u64> size4; - BitField<20, 2, Format> format5; - BitField<22, 2, u64> size5; - BitField<24, 2, Format> format6; - BitField<26, 2, u64> size6; - BitField<28, 2, Format> format7; - BitField<30, 2, u64> size7; - BitField<32, 2, Format> format8; - BitField<34, 2, u64> size8; - BitField<36, 2, Format> format9; - BitField<38, 2, u64> size9; - BitField<40, 2, Format> format10; - BitField<42, 2, u64> size10; - BitField<44, 2, Format> format11; - BitField<46, 2, u64> size11; + BitField<0, 29, u32> base_address; - BitField<48, 12, u64> attribute_mask; - BitField<60, 4, u64> num_attributes; // number of total attributes minus 1 - } vertex_descriptor; + inline u32 GetBaseAddress() const { + // TODO: Ugly, should fix PhysicalToVirtualAddress instead + return DecodeAddressRegister(base_address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR; + } - INSERT_PADDING_WORDS(0xfe); + // Descriptor for internal vertex attributes + union { + BitField< 0, 2, Format> format0; // size of one element + BitField< 2, 2, u64> size0; // number of elements minus 1 + BitField< 4, 2, Format> format1; + BitField< 6, 2, u64> 
size1; + BitField< 8, 2, Format> format2; + BitField<10, 2, u64> size2; + BitField<12, 2, Format> format3; + BitField<14, 2, u64> size3; + BitField<16, 2, Format> format4; + BitField<18, 2, u64> size4; + BitField<20, 2, Format> format5; + BitField<22, 2, u64> size5; + BitField<24, 2, Format> format6; + BitField<26, 2, u64> size6; + BitField<28, 2, Format> format7; + BitField<30, 2, u64> size7; + BitField<32, 2, Format> format8; + BitField<34, 2, u64> size8; + BitField<36, 2, Format> format9; + BitField<38, 2, u64> size9; + BitField<40, 2, Format> format10; + BitField<42, 2, u64> size10; + BitField<44, 2, Format> format11; + BitField<46, 2, u64> size11; + + BitField<48, 12, u64> attribute_mask; + + // number of total attributes minus 1 + BitField<60, 4, u64> num_extra_attributes; + }; + + inline Format GetFormat(int n) const { + Format formats[] = { + format0, format1, format2, format3, + format4, format5, format6, format7, + format8, format9, format10, format11 + }; + return formats[n]; + } + + inline int GetNumElements(int n) const { + u64 sizes[] = { + size0, size1, size2, size3, + size4, size5, size6, size7, + size8, size9, size10, size11 + }; + return (int)sizes[n]+1; + } + + inline int GetElementSizeInBytes(int n) const { + return (GetFormat(n) == Format::FLOAT) ? 4 : + (GetFormat(n) == Format::SHORT) ? 2 : 1; + } + + inline int GetStride(int n) const { + return GetNumElements(n) * GetElementSizeInBytes(n); + } + + inline int GetNumTotalAttributes() const { + return (int)num_extra_attributes+1; + } + + // Attribute loaders map the source vertex data to input attributes + // This e.g. allows to load different attributes from different memory locations + struct { + // Source attribute data offset from the base address + u32 data_offset; + + union { + BitField< 0, 4, u64> comp0; + BitField< 4, 4, u64> comp1; + BitField< 8, 4, u64> comp2; + BitField<12, 4, u64> comp3; + BitField<16, 4, u64> comp4; + BitField<20, 4, u64> comp5; + BitField<24, 4, u64> comp6; + BitField<28, 4, u64> comp7; + BitField<32, 4, u64> comp8; + BitField<36, 4, u64> comp9; + BitField<40, 4, u64> comp10; + BitField<44, 4, u64> comp11; + + // bytes for a single vertex in this loader + BitField<48, 8, u64> byte_count; + + BitField<60, 4, u64> component_count; + }; + + inline int GetComponent(int n) const { + u64 components[] = { + comp0, comp1, comp2, comp3, + comp4, comp5, comp6, comp7, + comp8, comp9, comp10, comp11 + }; + return (int)components[n]; + } + } attribute_loaders[12]; + } vertex_attributes; + + struct { + enum IndexFormat : u32 { + BYTE = 0, + SHORT = 1, + }; + + union { + BitField<0, 31, u32> offset; // relative to base attribute address + BitField<31, 1, IndexFormat> format; + }; + } index_array; + + // Number of vertices to render + u32 num_vertices; + + INSERT_PADDING_WORDS(0x5); + + // These two trigger rendering of triangles + u32 trigger_draw; + u32 trigger_draw_indexed; + + INSERT_PADDING_WORDS(0x2e); + + enum class TriangleTopology : u32 { + List = 0, + Strip = 1, + Fan = 2, + ListIndexed = 3, // TODO: No idea if this is correct + }; + + BitField<8, 2, TriangleTopology> triangle_topology; + + INSERT_PADDING_WORDS(0x5b); + + // Offset to shader program entry point (in words) + BitField<0, 16, u32> vs_main_offset; + + union { + BitField< 0, 4, u64> attribute0_register; + BitField< 4, 4, u64> attribute1_register; + BitField< 8, 4, u64> attribute2_register; + BitField<12, 4, u64> attribute3_register; + BitField<16, 4, u64> attribute4_register; + BitField<20, 4, u64> attribute5_register; + BitField<24, 
4, u64> attribute6_register; + BitField<28, 4, u64> attribute7_register; + BitField<32, 4, u64> attribute8_register; + BitField<36, 4, u64> attribute9_register; + BitField<40, 4, u64> attribute10_register; + BitField<44, 4, u64> attribute11_register; + BitField<48, 4, u64> attribute12_register; + BitField<52, 4, u64> attribute13_register; + BitField<56, 4, u64> attribute14_register; + BitField<60, 4, u64> attribute15_register; + + int GetRegisterForAttribute(int attribute_index) { + u64 fields[] = { + attribute0_register, attribute1_register, attribute2_register, attribute3_register, + attribute4_register, attribute5_register, attribute6_register, attribute7_register, + attribute8_register, attribute9_register, attribute10_register, attribute11_register, + attribute12_register, attribute13_register, attribute14_register, attribute15_register, + }; + return (int)fields[attribute_index]; + } + } vs_input_register_map; + + INSERT_PADDING_WORDS(0x3); + + struct { + enum Format : u32 + { + FLOAT24 = 0, + FLOAT32 = 1 + }; + + bool IsFloat32() const { + return format == FLOAT32; + } + + union { + // Index of the next uniform to write to + // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices + BitField<0, 7, u32> index; + + BitField<31, 1, Format> format; + }; + + // Writing to these registers sets the "current" uniform. + // TODO: It's not clear how the hardware stores what the "current" uniform is. + u32 set_value[8]; + + } vs_uniform_setup; + + INSERT_PADDING_WORDS(0x2); + + struct { + u32 begin_load; + + // Writing to these registers sets the "current" word in the shader program. + // TODO: It's not clear how the hardware stores what the "current" word is. + u32 set_word[8]; + } vs_program; + + INSERT_PADDING_WORDS(0x1); + + // This register group is used to load an internal table of swizzling patterns, + // which are indexed by each shader instruction to specify vector component swizzling. + struct { + u32 begin_load; + + // Writing to these registers sets the "current" swizzle pattern in the table. + // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is. 
+ u32 set_word[8]; + } vs_swizzle_patterns; + + INSERT_PADDING_WORDS(0x22); #undef INSERT_PADDING_WORDS_HELPER1 #undef INSERT_PADDING_WORDS_HELPER2 @@ -112,7 +400,21 @@ struct Regs { ADD_FIELD(viewport_size_x); ADD_FIELD(viewport_size_y); - ADD_FIELD(vertex_descriptor); + ADD_FIELD(viewport_depth_range); + ADD_FIELD(viewport_depth_far_plane); + ADD_FIELD(viewport_corner); + ADD_FIELD(framebuffer); + ADD_FIELD(vertex_attributes); + ADD_FIELD(index_array); + ADD_FIELD(num_vertices); + ADD_FIELD(trigger_draw); + ADD_FIELD(trigger_draw_indexed); + ADD_FIELD(triangle_topology); + ADD_FIELD(vs_main_offset); + ADD_FIELD(vs_input_register_map); + ADD_FIELD(vs_uniform_setup); + ADD_FIELD(vs_program); + ADD_FIELD(vs_swizzle_patterns); #undef ADD_FIELD #endif // _MSC_VER @@ -153,13 +455,106 @@ private: ASSERT_REG_POSITION(viewport_size_x, 0x41); ASSERT_REG_POSITION(viewport_size_y, 0x43); -ASSERT_REG_POSITION(vertex_descriptor, 0x200); +ASSERT_REG_POSITION(viewport_depth_range, 0x4d); +ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e); +ASSERT_REG_POSITION(vs_output_attributes[0], 0x50); +ASSERT_REG_POSITION(vs_output_attributes[1], 0x51); +ASSERT_REG_POSITION(viewport_corner, 0x68); +ASSERT_REG_POSITION(framebuffer, 0x110); +ASSERT_REG_POSITION(vertex_attributes, 0x200); +ASSERT_REG_POSITION(index_array, 0x227); +ASSERT_REG_POSITION(num_vertices, 0x228); +ASSERT_REG_POSITION(trigger_draw, 0x22e); +ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f); +ASSERT_REG_POSITION(triangle_topology, 0x25e); +ASSERT_REG_POSITION(vs_main_offset, 0x2ba); +ASSERT_REG_POSITION(vs_input_register_map, 0x2bb); +ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0); +ASSERT_REG_POSITION(vs_program, 0x2cb); +ASSERT_REG_POSITION(vs_swizzle_patterns, 0x2d5); #undef ASSERT_REG_POSITION #endif // !defined(_MSC_VER) // The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway. 
-static_assert(sizeof(Regs) == 0x300 * sizeof(u32), "Invalid total size of register set"); +static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be"); +static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be"); + +extern Regs registers; // TODO: Not sure if we want to have one global instance for this + + +struct float24 { + static float24 FromFloat32(float val) { + float24 ret; + ret.value = val; + return ret; + } + + // 16 bit mantissa, 7 bit exponent, 1 bit sign + // TODO: No idea if this works as intended + static float24 FromRawFloat24(u32 hex) { + float24 ret; + if ((hex & 0xFFFFFF) == 0) { + ret.value = 0; + } else { + u32 mantissa = hex & 0xFFFF; + u32 exponent = (hex >> 16) & 0x7F; + u32 sign = hex >> 23; + ret.value = powf(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * powf(2.0f, -16.f)); + if (sign) + ret.value = -ret.value; + } + return ret; + } + + // Not recommended for anything but logging + float ToFloat32() const { + return value; + } + + float24 operator * (const float24& flt) const { + return float24::FromFloat32(ToFloat32() * flt.ToFloat32()); + } + + float24 operator / (const float24& flt) const { + return float24::FromFloat32(ToFloat32() / flt.ToFloat32()); + } + + float24 operator + (const float24& flt) const { + return float24::FromFloat32(ToFloat32() + flt.ToFloat32()); + } + + float24 operator - (const float24& flt) const { + return float24::FromFloat32(ToFloat32() - flt.ToFloat32()); + } + + float24 operator - () const { + return float24::FromFloat32(-ToFloat32()); + } + + bool operator < (const float24& flt) const { + return ToFloat32() < flt.ToFloat32(); + } + + bool operator > (const float24& flt) const { + return ToFloat32() > flt.ToFloat32(); + } + + bool operator >= (const float24& flt) const { + return ToFloat32() >= flt.ToFloat32(); + } + + bool operator <= (const float24& flt) const { + return ToFloat32() <= flt.ToFloat32(); + } + +private: + float24() = default; + + // Stored as a regular float, merely for convenience + // TODO: Perform proper arithmetic on this! + float value; +}; union CommandHeader { CommandHeader(u32 h) : hex(h) {} diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp new file mode 100644 index 000000000..2354ffb99 --- /dev/null +++ b/src/video_core/primitive_assembly.cpp @@ -0,0 +1,51 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
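The raw 24-bit float decoding introduced in pica.h above (float24::FromRawFloat24) can be sanity-checked in isolation. The following is a minimal standalone sketch that mirrors that decode path, assuming the same 16-bit mantissa / 7-bit exponent / 1-bit sign split with an exponent bias of 63; DecodeFloat24 is a hypothetical helper name used only for illustration.

// Mirrors float24::FromRawFloat24 above (16-bit mantissa, 7-bit exponent, 1-bit sign).
#include <cmath>
#include <cstdint>
#include <cstdio>

static float DecodeFloat24(uint32_t hex) {
    if ((hex & 0xFFFFFF) == 0)
        return 0.f;
    uint32_t mantissa = hex & 0xFFFF;
    uint32_t exponent = (hex >> 16) & 0x7F;
    uint32_t sign = hex >> 23;
    float value = std::pow(2.0f, (float)exponent - 63.0f) * (1.0f + mantissa * std::pow(2.0f, -16.f));
    return sign ? -value : value;
}

int main() {
    std::printf("%f\n", DecodeFloat24(0x3F0000)); // exponent 63, mantissa 0      -> 1.0
    std::printf("%f\n", DecodeFloat24(0x3F8000)); // mantissa 0x8000 (= 0.5)      -> 1.5
    std::printf("%f\n", DecodeFloat24(0xBF0000)); // same as first, sign bit set  -> -1.0
}

With the decode above, 0x3F0000 maps to 1.0, which is a convenient value to check against ctrulib or hardware dumps.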
+ +#include "clipper.h" +#include "pica.h" +#include "primitive_assembly.h" +#include "vertex_shader.h" + +namespace Pica { + +namespace PrimitiveAssembly { + +static OutputVertex buffer[2]; +static int buffer_index = 0; // TODO: reset this on emulation restart + +void SubmitVertex(OutputVertex& vtx) +{ + switch (registers.triangle_topology) { + case Regs::TriangleTopology::List: + case Regs::TriangleTopology::ListIndexed: + if (buffer_index < 2) { + buffer[buffer_index++] = vtx; + } else { + buffer_index = 0; + + Clipper::ProcessTriangle(buffer[0], buffer[1], vtx); + } + break; + + case Regs::TriangleTopology::Fan: + if (buffer_index == 2) { + buffer_index = 0; + + Clipper::ProcessTriangle(buffer[0], buffer[1], vtx); + + buffer[1] = vtx; + } else { + buffer[buffer_index++] = vtx; + } + break; + + default: + ERROR_LOG(GPU, "Unknown triangle mode %x:", (int)registers.triangle_topology.Value()); + break; + } +} + +} // namespace + +} // namespace diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h new file mode 100644 index 000000000..2a2b0c170 --- /dev/null +++ b/src/video_core/primitive_assembly.h @@ -0,0 +1,21 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +namespace Pica { + +namespace VertexShader { + struct OutputVertex; +} + +namespace PrimitiveAssembly { + +using VertexShader::OutputVertex; + +void SubmitVertex(OutputVertex& vtx); + +} // namespace + +} // namespace diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp new file mode 100644 index 000000000..a7c1bab3e --- /dev/null +++ b/src/video_core/rasterizer.cpp @@ -0,0 +1,180 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. 
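As a quick reference for the vertex buffering in PrimitiveAssembly::SubmitVertex above: a triangle list consumes vertices in independent groups of three, while a triangle fan is expected to reuse its first submitted vertex for every triangle, e.g. (0,1,2), (0,2,3), (0,3,4). Below is a minimal standalone sketch of the list path only, using plain ints in place of OutputVertex and a hypothetical EmitTriangle callback standing in for Clipper::ProcessTriangle.

#include <cstdio>

static int buffer[2];
static int buffer_index = 0;

// Hypothetical stand-in for Clipper::ProcessTriangle
static void EmitTriangle(int v0, int v1, int v2) {
    std::printf("triangle (%d, %d, %d)\n", v0, v1, v2);
}

// Mirrors the List/ListIndexed path of SubmitVertex above
static void SubmitListVertex(int vtx) {
    if (buffer_index < 2) {
        buffer[buffer_index++] = vtx;
    } else {
        buffer_index = 0;
        EmitTriangle(buffer[0], buffer[1], vtx);
    }
}

int main() {
    for (int v = 0; v < 6; ++v)
        SubmitListVertex(v);   // prints (0, 1, 2) and (3, 4, 5)
}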
+
+#include <algorithm>
+
+#include "common/common_types.h"
+
+#include "math.h"
+#include "pica.h"
+#include "rasterizer.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace Rasterizer {
+
+static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
+    u32* color_buffer = (u32*)Memory::GetPointer(registers.framebuffer.GetColorBufferAddress());
+    u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
+
+    // Assuming RGBA8 format until actual framebuffer format handling is implemented
+    *(color_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+static u32 GetDepth(int x, int y) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    return *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2);
+}
+
+static void SetDepth(int x, int y, u16 value) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2)
+{
+    // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
+    struct Fix12P4 {
+        Fix12P4() {}
+        Fix12P4(u16 val) : val(val) {}
+
+        static u16 FracMask() { return 0xF; }
+        static u16 IntMask() { return (u16)~0xF; }
+
+        operator u16() const {
+            return val;
+        }
+
+        bool operator < (const Fix12P4& oth) const {
+            return (u16)*this < (u16)oth;
+        }
+
+    private:
+        u16 val;
+    };
+
+    // vertex positions in rasterizer coordinates
+    auto FloatToFix = [](float24 flt) {
+        return Fix12P4(flt.ToFloat32() * 16.0f);
+    };
+    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
+        return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+    };
+    Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
+                                   ScreenToRasterizerCoordinates(v1.screenpos),
+                                   ScreenToRasterizerCoordinates(v2.screenpos) };
+
+    // TODO: Proper scissor rect test!
+    u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+    u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+
+    min_x = min_x & Fix12P4::IntMask();
+    min_y = min_y & Fix12P4::IntMask();
+    max_x = (max_x + Fix12P4::FracMask()) & Fix12P4::IntMask();
+    max_y = (max_y + Fix12P4::FracMask()) & Fix12P4::IntMask();
+
+    // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not
+    // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias
+    // values which are added to the barycentric coordinates w0, w1 and w2, respectively.
+    // NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones...
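Before those bias values are applied, each pixel's coverage is decided from the barycentric weights w0, w1 and w2, which are computed with an edge function (the orient2d lambda further down). Below is a standalone sketch with plain ints; Orient2D is a hypothetical helper, but its formula matches the cross-product form used in the rasterization loop.

#include <cstdio>

struct P { int x, y; };

// Twice the signed area of the triangle (a, b, c); positive if c lies to the
// left of the edge a->b (counter-clockwise winding).
static int Orient2D(P a, P b, P c) {
    return (b.x - a.x) * (c.y - a.y) - (b.y - a.y) * (c.x - a.x);
}

int main() {
    P v0{0, 0}, v1{16, 0}, v2{0, 16};   // a one-pixel triangle in 12.4 fixed point
    P p{4, 4};                          // sample point inside the triangle

    int w0 = Orient2D(v1, v2, p);
    int w1 = Orient2D(v2, v0, p);
    int w2 = Orient2D(v0, v1, p);
    // All three weights are >= 0 for points inside a counter-clockwise triangle;
    // their sum equals twice the triangle area (here 256).
    std::printf("w0=%d w1=%d w2=%d sum=%d\n", w0, w1, w2, w0 + w1 + w2);
}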
+    auto IsRightSideOrFlatBottomEdge = [](const Math::Vec2<Fix12P4>& vtx,
+                                          const Math::Vec2<Fix12P4>& line1,
+                                          const Math::Vec2<Fix12P4>& line2)
+    {
+        if (line1.y == line2.y) {
+            // just check if vertex is above us => bottom line parallel to x-axis
+            return vtx.y < line1.y;
+        } else {
+            // check if vertex is on our left => right side
+            // TODO: Not sure how likely this is to overflow
+            return (int)vtx.x < (int)line1.x + ((int)line2.x - (int)line1.x) * ((int)vtx.y - (int)line1.y) / ((int)line2.y - (int)line1.y);
+        }
+    };
+    int bias0 = IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
+    int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
+    int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
+
+    // TODO: Not sure if looping through x first might be faster
+    for (u16 y = min_y; y < max_y; y += 0x10) {
+        for (u16 x = min_x; x < max_x; x += 0x10) {
+
+            // Calculate the barycentric coordinates w0, w1 and w2
+            auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
+                               const Math::Vec2<Fix12P4>& vtx2,
+                               const Math::Vec2<Fix12P4>& vtx3) {
+                const auto vec1 = (vtx2.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                const auto vec2 = (vtx3.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+                return Cross(vec1, vec2).z;
+            };
+
+            int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+            int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+            int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+            int wsum = w0 + w1 + w2;
+
+            // If current pixel is not covered by the current primitive
+            if (w0 < 0 || w1 < 0 || w2 < 0)
+                continue;
+
+            // Perspective correct attribute interpolation:
+            // Attribute values cannot be calculated by simple linear interpolation since
+            // they are not linear in screen space. For example, when interpolating a
+            // texture coordinate across two vertices, something simple like
+            //     u = (u0*w0 + u1*w1)/(w0+w1)
+            // will not work. However, the attribute value divided by the
+            // clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
+            // in screenspace. Hence, we can linearly interpolate these two independently and
+            // calculate the interpolated attribute by dividing the results.
+            // I.e.
+            //     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
+            //     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
+            //     u = u_over_w / one_over_w
+            //
+            // The generalization to three vertices is straightforward in barycentric coordinates.
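A small numeric sketch of the interpolation scheme described in the comment above, reduced to two vertices and plain floats (variable names are illustrative only):

#include <cstdio>

int main() {
    // Texture coordinate u and clip-space w at two vertices.
    float u0 = 0.f, w0_clip = 1.f;
    float u1 = 1.f, w1_clip = 4.f;
    float t = 0.5f;                  // halfway between them in screen space

    // Naive linear interpolation (incorrect under perspective):
    float u_linear = u0 * (1.f - t) + u1 * t;                             // 0.5

    // Interpolate u/w and 1/w linearly, then divide:
    float u_over_w   = (u0 / w0_clip) * (1.f - t) + (u1 / w1_clip) * t;   // 0.125
    float one_over_w = (1.f / w0_clip) * (1.f - t) + (1.f / w1_clip) * t; // 0.625
    float u_correct  = u_over_w / one_over_w;                             // 0.2

    std::printf("linear=%f perspective-correct=%f\n", u_linear, u_correct);
}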
+ auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) { + auto attr_over_w = Math::MakeVec3(attr0 / v0.pos.w, + attr1 / v1.pos.w, + attr2 / v2.pos.w); + auto w_inverse = Math::MakeVec3(float24::FromFloat32(1.f) / v0.pos.w, + float24::FromFloat32(1.f) / v1.pos.w, + float24::FromFloat32(1.f) / v2.pos.w); + auto baricentric_coordinates = Math::MakeVec3(float24::FromFloat32(w0), + float24::FromFloat32(w1), + float24::FromFloat32(w2)); + + float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates); + float24 interpolated_w_inverse = Math::Dot(w_inverse, baricentric_coordinates); + return interpolated_attr_over_w / interpolated_w_inverse; + }; + + Math::Vec4 primary_color{ + (u8)(GetInterpolatedAttribute(v0.color.r(), v1.color.r(), v2.color.r()).ToFloat32() * 255), + (u8)(GetInterpolatedAttribute(v0.color.g(), v1.color.g(), v2.color.g()).ToFloat32() * 255), + (u8)(GetInterpolatedAttribute(v0.color.b(), v1.color.b(), v2.color.b()).ToFloat32() * 255), + (u8)(GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * 255) + }; + + u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 + + (float)v1.screenpos[2].ToFloat32() * w1 + + (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); // TODO: Shouldn't need to multiply by 65536? + SetDepth(x >> 4, y >> 4, z); + + DrawPixel(x >> 4, y >> 4, primary_color); + } + } +} + +} // namespace Rasterizer + +} // namespace Pica diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h new file mode 100644 index 000000000..500be9462 --- /dev/null +++ b/src/video_core/rasterizer.h @@ -0,0 +1,21 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +namespace Pica { + +namespace VertexShader { + struct OutputVertex; +} + +namespace Rasterizer { + +void ProcessTriangle(const VertexShader::OutputVertex& v0, + const VertexShader::OutputVertex& v1, + const VertexShader::OutputVertex& v2); + +} // namespace Rasterizer + +} // namespace Pica diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 02b174562..f11a64fad 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -81,20 +81,20 @@ void RendererOpenGL::RenderXFB(const common::Rect& src_rect, const common::Rect& const auto& framebuffer_top = GPU::g_regs.framebuffer_config[0]; const auto& framebuffer_sub = GPU::g_regs.framebuffer_config[1]; const u32 active_fb_top = (framebuffer_top.active_fb == 1) - ? framebuffer_top.address_left2 - : framebuffer_top.address_left1; + ? Memory::PhysicalToVirtualAddress(framebuffer_top.address_left2) + : Memory::PhysicalToVirtualAddress(framebuffer_top.address_left1); const u32 active_fb_sub = (framebuffer_sub.active_fb == 1) - ? framebuffer_sub.address_left2 - : framebuffer_sub.address_left1; + ? Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left2) + : Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left1); DEBUG_LOG(GPU, "RenderXFB: 0x%08x bytes from 0x%08x(%dx%d), fmt %x", framebuffer_top.stride * framebuffer_top.height, - GPU::GetFramebufferAddr(active_fb_top), (int)framebuffer_top.width, + active_fb_top, (int)framebuffer_top.width, (int)framebuffer_top.height, (int)framebuffer_top.format); // TODO: This should consider the GPU registers for framebuffer width, height and stride. 
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_top), m_xfb_top_flipped);
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_sub), m_xfb_bottom_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_top), m_xfb_top_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_sub), m_xfb_bottom_flipped);
 
     // Blit the top framebuffer
     // ------------------------
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
new file mode 100644
index 000000000..93830a96a
--- /dev/null
+++ b/src/video_core/vertex_shader.cpp
@@ -0,0 +1,270 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "pica.h"
+#include "vertex_shader.h"
+#include <algorithm>
+#include <cmath>
+
+namespace Pica {
+
+namespace VertexShader {
+
+static struct {
+    Math::Vec4<float24> f[96];
+} shader_uniforms;
+
+
+// TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
+// For now, we just keep these local arrays around.
+static u32 shader_memory[1024];
+static u32 swizzle_data[1024];
+
+void SubmitShaderMemoryChange(u32 addr, u32 value)
+{
+    shader_memory[addr] = value;
+}
+
+void SubmitSwizzleDataChange(u32 addr, u32 value)
+{
+    swizzle_data[addr] = value;
+}
+
+Math::Vec4<float24>& GetFloatUniform(u32 index)
+{
+    return shader_uniforms.f[index];
+}
+
+struct VertexShaderState {
+    u32* program_counter;
+
+    const float24* input_register_table[16];
+    float24* output_register_table[7*4];
+
+    Math::Vec4<float24> temporary_registers[16];
+    bool status_registers[2];
+
+    enum {
+        INVALID_ADDRESS = 0xFFFFFFFF
+    };
+    u32 call_stack[8]; // TODO: What is the maximal call stack depth?
+    u32* call_stack_pointer;
+};
+
+static void ProcessShaderCode(VertexShaderState& state) {
+    while (true) {
+        bool increment_pc = true;
+        bool exit_loop = false;
+        const Instruction& instr = *(const Instruction*)state.program_counter;
+
+        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1]
+                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x
+                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x
+                             : nullptr;
+        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2]
+                             : &state.temporary_registers[instr.common.src2-0x10].x;
+        // TODO: Unsure about the limit values
+        float24* dest = (instr.common.dest <= 0x1C) ? state.output_register_table[instr.common.dest]
+                      : (instr.common.dest <= 0x3C) ? nullptr
+                      : (instr.common.dest <= 0x7C) ? 
&state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4] + : nullptr; + + const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id]; + + const float24 src1[4] = { + src1_[(int)swizzle.GetSelectorSrc1(0)], + src1_[(int)swizzle.GetSelectorSrc1(1)], + src1_[(int)swizzle.GetSelectorSrc1(2)], + src1_[(int)swizzle.GetSelectorSrc1(3)], + }; + const float24 src2[4] = { + src2_[(int)swizzle.GetSelectorSrc2(0)], + src2_[(int)swizzle.GetSelectorSrc2(1)], + src2_[(int)swizzle.GetSelectorSrc2(2)], + src2_[(int)swizzle.GetSelectorSrc2(3)], + }; + + switch (instr.opcode) { + case Instruction::OpCode::ADD: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i] + src2[i]; + } + + break; + } + + case Instruction::OpCode::MUL: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i] * src2[i]; + } + + break; + } + + case Instruction::OpCode::DP3: + case Instruction::OpCode::DP4: + { + float24 dot = float24::FromFloat32(0.f); + int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4; + for (int i = 0; i < num_components; ++i) + dot = dot + src1[i] * src2[i]; + + for (int i = 0; i < num_components; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = dot; + } + break; + } + + // Reciprocal + case Instruction::OpCode::RCP: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + // TODO: Be stable against division by zero! + // TODO: I think this might be wrong... we should only use one component here + dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32()); + } + + break; + } + + // Reciprocal Square Root + case Instruction::OpCode::RSQ: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + // TODO: Be stable against division by zero! + // TODO: I think this might be wrong... we should only use one component here + dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32())); + } + + break; + } + + case Instruction::OpCode::MOV: + { + for (int i = 0; i < 4; ++i) { + if (!swizzle.DestComponentEnabled(i)) + continue; + + dest[i] = src1[i]; + } + break; + } + + case Instruction::OpCode::RET: + if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) { + exit_loop = true; + } else { + state.program_counter = &shader_memory[*state.call_stack_pointer--]; + *state.call_stack_pointer = VertexShaderState::INVALID_ADDRESS; + } + + break; + + case Instruction::OpCode::CALL: + increment_pc = false; + + _dbg_assert_(GPU, state.call_stack_pointer - state.call_stack < sizeof(state.call_stack)); + + *++state.call_stack_pointer = state.program_counter - shader_memory; + // TODO: Does this offset refer to the beginning of shader memory? + state.program_counter = &shader_memory[instr.flow_control.offset_words]; + break; + + case Instruction::OpCode::FLS: + // TODO: Do whatever needs to be done here? 
+ break; + + default: + ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", + (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex); + break; + } + + if (increment_pc) + ++state.program_counter; + + if (exit_loop) + break; + } +} + +OutputVertex RunShader(const InputVertex& input, int num_attributes) +{ + VertexShaderState state; + + const u32* main = &shader_memory[registers.vs_main_offset]; + state.program_counter = (u32*)main; + + // Setup input register table + const auto& attribute_register_map = registers.vs_input_register_map; + float24 dummy_register; + std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register); + if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; + if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; + if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; + if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; + if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; + if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; + if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; + if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; + if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; + if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; + if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; + if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; + if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; + if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; + if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; + if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; + + // Setup output register table + OutputVertex ret; + for (int i = 0; i < 7; ++i) { + const auto& output_register_map = registers.vs_output_attributes[i]; + + u32 semantics[4] = { + output_register_map.map_x, output_register_map.map_y, + output_register_map.map_z, output_register_map.map_w + }; + + for (int comp = 0; comp < 4; ++comp) + state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp]; + } + + state.status_registers[0] = false; + state.status_registers[1] = false; + std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]), + VertexShaderState::INVALID_ADDRESS); + state.call_stack_pointer = &state.call_stack[0]; + + ProcessShaderCode(state); + + DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", + ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), + ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), 
ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), + ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); + + return ret; +} + + +} // namespace + +} // namespace diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h new file mode 100644 index 000000000..1b71e367b --- /dev/null +++ b/src/video_core/vertex_shader.h @@ -0,0 +1,211 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once + +#include + +#include + +#include "math.h" +#include "pica.h" + +namespace Pica { + +namespace VertexShader { + +struct InputVertex { + Math::Vec4 attr[16]; +}; + +struct OutputVertex { + OutputVertex() = default; + + // VS output attributes + Math::Vec4 pos; + Math::Vec4 dummy; // quaternions (not implemented, yet) + Math::Vec4 color; + Math::Vec2 tc0; + float24 tc0_v; + + // Padding for optimal alignment + float24 pad[14]; + + // Attributes used to store intermediate results + + // position after perspective divide + Math::Vec3 screenpos; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const OutputVertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... + tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { + OutputVertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; +static_assert(std::is_pod::value, "Structure is not POD"); + +union Instruction { + enum class OpCode : u32 { + ADD = 0x0, + DP3 = 0x1, + DP4 = 0x2, + + MUL = 0x8, + + MAX = 0xC, + MIN = 0xD, + RCP = 0xE, + RSQ = 0xF, + + MOV = 0x13, + + RET = 0x21, + FLS = 0x22, // Flush + CALL = 0x24, + }; + + std::string GetOpCodeName() const { + std::map map = { + { OpCode::ADD, "ADD" }, + { OpCode::DP3, "DP3" }, + { OpCode::DP4, "DP4" }, + { OpCode::MUL, "MUL" }, + { OpCode::MAX, "MAX" }, + { OpCode::MIN, "MIN" }, + { OpCode::RCP, "RCP" }, + { OpCode::RSQ, "RSQ" }, + { OpCode::MOV, "MOV" }, + { OpCode::RET, "RET" }, + { OpCode::FLS, "FLS" }, + }; + auto it = map.find(opcode); + if (it == map.end()) + return "UNK"; + else + return it->second; + } + + u32 hex; + + BitField<0x1a, 0x6, OpCode> opcode; + + // General notes: + // + // When two input registers are used, one of them uses a 5-bit index while the other + // one uses a 7-bit index. This is because at most one floating point uniform may be used + // as an input. + + + // Format used e.g. by arithmetic instructions and comparisons + // "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats), + // while "dest" addresses individual floats. 
+ union { + BitField<0x00, 0x5, u32> operand_desc_id; + BitField<0x07, 0x5, u32> src2; + BitField<0x0c, 0x7, u32> src1; + BitField<0x13, 0x7, u32> dest; + } common; + + // Format used for flow control instructions ("if") + union { + BitField<0x00, 0x8, u32> num_instructions; + BitField<0x0a, 0xc, u32> offset_words; + } flow_control; +}; + +union SwizzlePattern { + u32 hex; + + enum class Selector : u32 { + x = 0, + y = 1, + z = 2, + w = 3 + }; + + Selector GetSelectorSrc1(int comp) const { + Selector selectors[] = { + src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3 + }; + return selectors[comp]; + } + + Selector GetSelectorSrc2(int comp) const { + Selector selectors[] = { + src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3 + }; + return selectors[comp]; + } + + bool DestComponentEnabled(int i) const { + return (dest_mask & (0x8 >> i)); + } + + std::string SelectorToString(bool src2) const { + std::map map = { + { Selector::x, "x" }, + { Selector::y, "y" }, + { Selector::z, "z" }, + { Selector::w, "w" } + }; + std::string ret; + for (int i = 0; i < 4; ++i) { + ret += map.at(src2 ? GetSelectorSrc2(i) : GetSelectorSrc1(i)); + } + return ret; + } + + std::string DestMaskToString() const { + std::string ret; + for (int i = 0; i < 4; ++i) { + if (!DestComponentEnabled(i)) + ret += "_"; + else + ret += "xyzw"[i]; + } + return ret; + } + + // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x + BitField< 0, 4, u32> dest_mask; + + BitField< 5, 2, Selector> src1_selector_3; + BitField< 7, 2, Selector> src1_selector_2; + BitField< 9, 2, Selector> src1_selector_1; + BitField<11, 2, Selector> src1_selector_0; + + BitField<14, 2, Selector> src2_selector_3; + BitField<16, 2, Selector> src2_selector_2; + BitField<18, 2, Selector> src2_selector_1; + BitField<20, 2, Selector> src2_selector_0; + + BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign? +}; + +void SubmitShaderMemoryChange(u32 addr, u32 value); +void SubmitSwizzleDataChange(u32 addr, u32 value); + +OutputVertex RunShader(const InputVertex& input, int num_attributes); + +Math::Vec4& GetFloatUniform(u32 index); + +} // namespace + +} // namespace + diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj index d77be2bef..48d77cdc4 100644 --- a/src/video_core/video_core.vcxproj +++ b/src/video_core/video_core.vcxproj @@ -20,14 +20,25 @@ + + + + + + + + + + + diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters index b89ac1ac4..31af4f1df 100644 --- a/src/video_core/video_core.vcxproj.filters +++ b/src/video_core/video_core.vcxproj.filters @@ -9,17 +9,28 @@ renderer_opengl + + + + + renderer_opengl + + + + + +
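For reference, the SwizzlePattern layout declared in vertex_shader.h above can be decoded by hand: the destination mask sits in bits 0-3 (LSB = dest.w), and the src1 selectors occupy two bits per component starting at bit 11 for component 0 and moving down by two bits per component. The following standalone sketch extracts both from an example word (0xD8C is a made-up value, not taken from a real shader).

#include <cstdio>

int main() {
    const char comp[] = "xyzw";
    unsigned hex = 0xD8C;   // made-up example pattern word

    // Destination mask: bit 3 enables dest.x ... bit 0 enables dest.w
    std::printf("dest mask: ");
    for (int i = 0; i < 4; ++i)
        std::printf("%c", (hex & (0x8u >> i)) ? comp[i] : '_');

    // src1 selector for component i is the 2-bit field starting at bit (11 - 2*i)
    std::printf("  src1 swizzle: ");
    for (int i = 0; i < 4; ++i)
        std::printf("%c", comp[(hex >> (11 - 2 * i)) & 3]);
    std::printf("\n");   // prints: dest mask: xy__  src1 swizzle: yzwx
}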