diff --git a/src/citra_qt/debugger/graphics/graphics_tracing.cpp b/src/citra_qt/debugger/graphics/graphics_tracing.cpp index 716ed50b8..17f1c5ce2 100644 --- a/src/citra_qt/debugger/graphics/graphics_tracing.cpp +++ b/src/citra_qt/debugger/graphics/graphics_tracing.cpp @@ -71,8 +71,8 @@ void GraphicsTracingWidget::StartRecording() { std::array default_attributes; for (unsigned i = 0; i < 16; ++i) { for (unsigned comp = 0; comp < 3; ++comp) { - default_attributes[4 * i + comp] = - nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32()); + default_attributes[4 * i + comp] = nihstro::to_float24( + Pica::g_state.input_default_attributes.attr[i][comp].ToFloat32()); } } diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp index f37524190..489ec5f21 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp @@ -511,7 +511,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d auto& shader_config = Pica::g_state.regs.vs; for (auto instr : shader_setup.program_code) info.code.push_back({instr}); - int num_attributes = Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes(); + int num_attributes = shader_config.max_input_attribute_index + 1; for (auto pattern : shader_setup.swizzle_data) info.swizzle_info.push_back({pattern}); @@ -522,11 +522,11 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d // Generate debug information Pica::Shader::InterpreterEngine shader_engine; shader_engine.SetupBatch(shader_setup, entry_point); - debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes); + debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, shader_config); // Reload widget state for (int attr = 0; attr < num_attributes; ++attr) { - unsigned source_attr = shader_config.input_register_map.GetRegisterForAttribute(attr); + unsigned source_attr = shader_config.GetRegisterForAttribute(attr); input_data_mapping[attr]->setText(QString("-> v%1").arg(source_attr)); input_data_container[attr]->setVisible(true); } diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h index 3292573f3..c249a2ff8 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h @@ -82,7 +82,7 @@ private: nihstro::ShaderInfo info; Pica::Shader::DebugData debug_data; - Pica::Shader::InputVertex input_vertex; + Pica::Shader::AttributeBuffer input_vertex; friend class GraphicsVertexShaderModel; }; diff --git a/src/common/bit_set.h b/src/common/bit_set.h index 3059d0cb0..9c2e6b28c 100644 --- a/src/common/bit_set.h +++ b/src/common/bit_set.h @@ -121,22 +121,19 @@ public: class Iterator { public: Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} - Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator(IntTy val) : m_val(val), m_bit(0) {} Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; } int operator*() { - return m_bit; + return m_bit + ComputeLsb(); } Iterator& operator++() { - if (m_val == 0) { - m_bit = -1; - } else { - int bit = LeastSignificantSetBit(m_val); - m_val &= ~(1 << bit); - m_bit = bit; - } + int lsb = ComputeLsb(); + m_val >>= lsb + 1; + m_bit += lsb + 1; + m_has_lsb = false; return *this; } Iterator operator++(int _) { @@ -145,15 +142,24 @@ public: return other; } bool operator==(Iterator other) const { - return m_bit == other.m_bit; + return m_val == other.m_val; } bool operator!=(Iterator other) const { - return m_bit != other.m_bit; + return m_val != other.m_val; } private: + int ComputeLsb() { + if (!m_has_lsb) { + m_lsb = LeastSignificantSetBit(m_val); + m_has_lsb = true; + } + return m_lsb; + } IntTy m_val; int m_bit; + int m_lsb = -1; + bool m_has_lsb = false; }; BitSet() : m_val(0) {} @@ -221,11 +227,10 @@ public: } Iterator begin() const { - Iterator it(m_val, 0); - return ++it; + return Iterator(m_val); } Iterator end() const { - return Iterator(m_val, -1); + return Iterator(0); } IntTy m_val; diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 05b5cea73..0774ffc53 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -18,6 +18,8 @@ #include "video_core/rasterizer.h" #include "video_core/shader/shader.h" +using Pica::Rasterizer::Vertex; + namespace Pica { namespace Clipper { @@ -29,20 +31,20 @@ public: float24::FromFloat32(0), float24::FromFloat32(0))) : coeffs(coeffs), bias(bias) {} - bool IsInside(const OutputVertex& vertex) const { + bool IsInside(const Vertex& vertex) const { return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0); } - bool IsOutSide(const OutputVertex& vertex) const { + bool IsOutSide(const Vertex& vertex) const { return !IsInside(vertex); } - OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const { + Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { float24 dp = Math::Dot(v0.pos + bias, coeffs); float24 dp_prev = Math::Dot(v1.pos + bias, coeffs); float24 factor = dp_prev / (dp_prev - dp); - return OutputVertex::Lerp(factor, v0, v1); + return Vertex::Lerp(factor, v0, v1); } private: @@ -51,7 +53,7 @@ private: Math::Vec4 bias; }; -static void InitScreenCoordinates(OutputVertex& vtx) { +static void InitScreenCoordinates(Vertex& vtx) { struct { float24 halfsize_x; float24 offset_x; @@ -91,8 +93,8 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. static const size_t MAX_VERTICES = 9; - static_vector buffer_a = {v0, v1, v2}; - static_vector buffer_b; + static_vector buffer_a = {v0, v1, v2}; + static_vector buffer_b; auto* output_list = &buffer_a; auto* input_list = &buffer_b; @@ -123,7 +125,7 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu std::swap(input_list, output_list); output_list->clear(); - const OutputVertex* reference_vertex = &input_list->back(); + const Vertex* reference_vertex = &input_list->back(); for (const auto& vertex : *input_list) { // NOTE: This algorithm changes vertex order in some cases! @@ -148,9 +150,9 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu InitScreenCoordinates((*output_list)[1]); for (size_t i = 0; i < output_list->size() - 2; i++) { - OutputVertex& vtx0 = (*output_list)[0]; - OutputVertex& vtx1 = (*output_list)[i + 1]; - OutputVertex& vtx2 = (*output_list)[i + 2]; + Vertex& vtx0 = (*output_list)[0]; + Vertex& vtx1 = (*output_list)[i + 1]; + Vertex& vtx2 = (*output_list)[i + 2]; InitScreenCoordinates(vtx2); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index eb79974a8..4955ff9f9 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -125,20 +125,21 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { // TODO: Verify that this actually modifies the register! if (setup.index < 15) { - g_state.vs_default_attributes[setup.index] = attribute; + g_state.input_default_attributes.attr[setup.index] = attribute; setup.index++; } else { - // Put each attribute into an immediate input buffer. - // When all specified immediate attributes are present, the Vertex Shader is invoked - // and everything is - // sent to the primitive assembler. + // Put each attribute into an immediate input buffer. When all specified immediate + // attributes are present, the Vertex Shader is invoked and everything is sent to + // the primitive assembler. auto& immediate_input = g_state.immediate.input_vertex; auto& immediate_attribute_id = g_state.immediate.current_attribute; - immediate_input.attr[immediate_attribute_id++] = attribute; + immediate_input.attr[immediate_attribute_id] = attribute; - if (immediate_attribute_id >= regs.vs.num_input_attributes + 1) { + if (immediate_attribute_id < regs.max_input_attrib_index) { + immediate_attribute_id += 1; + } else { MICROPROFILE_SCOPE(GPU_Drawing); immediate_attribute_id = 0; @@ -150,10 +151,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast(&immediate_input)); Shader::UnitState shader_unit; - shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); + Shader::AttributeBuffer output{}; + + shader_unit.LoadInput(regs.vs, immediate_input); shader_engine->Run(g_state.vs, shader_unit); - auto output_vertex = Shader::OutputVertex::FromRegisters( - shader_unit.registers.output, regs, regs.vs.output_mask); + shader_unit.WriteOutput(regs.vs, output); // Send to renderer using Pica::Shader::OutputVertex; @@ -162,7 +164,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2); }; - g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle); + g_state.primitive_assembler.SubmitVertex( + Shader::OutputVertex::FromAttributeBuffer(regs, output), AddTriangle); } } } @@ -280,19 +283,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (!vertex_cache_hit) { // Initialize data for the current vertex - Shader::InputVertex input; + Shader::AttributeBuffer input, output{}; loader.LoadVertex(base_address, index, vertex, input, memory_accesses); // Send to vertex shader if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input); - shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); + shader_unit.LoadInput(regs.vs, input); shader_engine->Run(g_state.vs, shader_unit); + shader_unit.WriteOutput(regs.vs, output); // Retrieve vertex from register data - output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, - regs, regs.vs.output_mask); + output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs, output); if (is_indexed) { vertex_cache[vertex_cache_pos] = output_vertex; diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 4ab4f1f40..731540b99 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -99,7 +99,8 @@ struct Regs { TEXCOORD1_U = 14, TEXCOORD1_V = 15, - // TODO: Not verified + TEXCOORD0_W = 16, + VIEW_X = 18, VIEW_Y = 19, VIEW_Z = 20, @@ -871,7 +872,7 @@ struct Regs { LightSrc light[8]; LightColor global_ambient; // Emission + (material.ambient * lighting.ambient) INSERT_PADDING_WORDS(0x1); - BitField<0, 3, u32> num_lights; // Number of enabled lights - 1 + BitField<0, 3, u32> max_light_index; // Number of enabled lights - 1 union { BitField<2, 2, LightingFresnelSelector> fresnel_selector; @@ -1048,7 +1049,7 @@ struct Regs { BitField<48, 12, u64> attribute_mask; // number of total attributes minus 1 - BitField<60, 4, u64> num_extra_attributes; + BitField<60, 4, u64> max_attribute_index; }; inline VertexAttributeFormat GetFormat(int n) const { @@ -1079,7 +1080,7 @@ struct Regs { } inline int GetNumTotalAttributes() const { - return (int)num_extra_attributes + 1; + return (int)max_attribute_index + 1; } // Attribute loaders map the source vertex data to input attributes @@ -1179,7 +1180,12 @@ struct Regs { } } command_buffer; - INSERT_PADDING_WORDS(0x07); + INSERT_PADDING_WORDS(4); + + /// Number of input attributes to the vertex shader minus 1 + BitField<0, 4, u32> max_input_attrib_index; + + INSERT_PADDING_WORDS(2); enum class GPUMode : u32 { Drawing = 0, @@ -1217,42 +1223,21 @@ struct Regs { union { // Number of input attributes to shader unit - 1 - BitField<0, 4, u32> num_input_attributes; + BitField<0, 4, u32> max_input_attribute_index; }; // Offset to shader program entry point (in words) BitField<0, 16, u32> main_offset; - union { - BitField<0, 4, u64> attribute0_register; - BitField<4, 4, u64> attribute1_register; - BitField<8, 4, u64> attribute2_register; - BitField<12, 4, u64> attribute3_register; - BitField<16, 4, u64> attribute4_register; - BitField<20, 4, u64> attribute5_register; - BitField<24, 4, u64> attribute6_register; - BitField<28, 4, u64> attribute7_register; - BitField<32, 4, u64> attribute8_register; - BitField<36, 4, u64> attribute9_register; - BitField<40, 4, u64> attribute10_register; - BitField<44, 4, u64> attribute11_register; - BitField<48, 4, u64> attribute12_register; - BitField<52, 4, u64> attribute13_register; - BitField<56, 4, u64> attribute14_register; - BitField<60, 4, u64> attribute15_register; + /// Maps input attributes to registers. 4-bits per attribute, specifying a register index + u32 input_attribute_to_register_map_low; + u32 input_attribute_to_register_map_high; - int GetRegisterForAttribute(int attribute_index) const { - u64 fields[] = { - attribute0_register, attribute1_register, attribute2_register, - attribute3_register, attribute4_register, attribute5_register, - attribute6_register, attribute7_register, attribute8_register, - attribute9_register, attribute10_register, attribute11_register, - attribute12_register, attribute13_register, attribute14_register, - attribute15_register, - }; - return (int)fields[attribute_index]; - } - } input_register_map; + unsigned int GetRegisterForAttribute(unsigned int attribute_index) const { + u64 map = ((u64)input_attribute_to_register_map_high << 32) | + (u64)input_attribute_to_register_map_low; + return (map >> (attribute_index * 4)) & 0b1111; + } BitField<0, 16, u32> output_mask; diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h index e4f2e6d5d..785d05650 100644 --- a/src/video_core/pica_state.h +++ b/src/video_core/pica_state.h @@ -23,7 +23,7 @@ struct State { Shader::ShaderSetup vs; Shader::ShaderSetup gs; - std::array, 16> vs_default_attributes; + Shader::AttributeBuffer input_default_attributes; struct { union LutEntry { @@ -66,7 +66,7 @@ struct State { /// Struct used to describe immediate mode rendering state struct ImmediateModeState { // Used to buffer partial vertices for immediate-mode rendering. - Shader::InputVertex input_vertex; + Shader::AttributeBuffer input_vertex; // Index of the next attribute to be loaded into `input_vertex`. u32 current_attribute = 0; } immediate; diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index be7377290..e71ff5719 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -14,7 +14,7 @@ PrimitiveAssembler::PrimitiveAssembler(Regs::TriangleTopology topolo : topology(topology), buffer_index(0) {} template -void PrimitiveAssembler::SubmitVertex(VertexType& vtx, +void PrimitiveAssembler::SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler) { switch (topology) { // TODO: Figure out what's different with TriangleTopology::Shader. diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 0384d5984..24da47382 100644 --- a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h @@ -15,7 +15,8 @@ namespace Pica { */ template struct PrimitiveAssembler { - using TriangleHandler = std::function; + using TriangleHandler = + std::function; PrimitiveAssembler(Regs::TriangleTopology topology = Regs::TriangleTopology::List); @@ -25,7 +26,7 @@ struct PrimitiveAssembler { * NOTE: We could specify the triangle handler in the constructor, but this way we can * keep event and handler code next to each other. */ - void SubmitVertex(VertexType& vtx, TriangleHandler triangle_handler); + void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler); /** * Resets the internal state of the PrimitiveAssembler. diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index c034c12d3..287d732b5 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -308,8 +308,8 @@ MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 24 * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing * culling via recursion. */ -static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1, - const Shader::OutputVertex& v2, bool reversed = false) { +static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Vertex& v2, + bool reversed = false) { const auto& regs = g_state.regs; MICROPROFILE_SCOPE(GPU_Rasterization); @@ -1277,8 +1277,7 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, const Shader } } -void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1, - const Shader::OutputVertex& v2) { +void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2) { ProcessTriangleInternal(v0, v1, v2); } diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h index 6cbda3067..3a72ac343 100644 --- a/src/video_core/rasterizer.h +++ b/src/video_core/rasterizer.h @@ -4,16 +4,44 @@ #pragma once -namespace Pica { +#include "video_core/shader/shader.h" -namespace Shader { -struct OutputVertex; -} +namespace Pica { namespace Rasterizer { -void ProcessTriangle(const Shader::OutputVertex& v0, const Shader::OutputVertex& v1, - const Shader::OutputVertex& v2); +struct Vertex : Shader::OutputVertex { + Vertex(const OutputVertex& v) : OutputVertex(v) {} + + // Attributes used to store intermediate results + // position after perspective divide + Math::Vec3 screenpos; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const Vertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... + tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); + tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) { + Vertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; + +void ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2); } // namespace Rasterizer diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f3674e965..071e4ace0 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -467,7 +467,7 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) { // Fragment lighting switches case PICA_REG_INDEX(lighting.disable): - case PICA_REG_INDEX(lighting.num_lights): + case PICA_REG_INDEX(lighting.max_light_index): case PICA_REG_INDEX(lighting.config0): case PICA_REG_INDEX(lighting.config1): case PICA_REG_INDEX(lighting.abs_lut_input): diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index cc3e4bed5..a1aa07074 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -84,7 +84,7 @@ union PicaShaderConfig { // Fragment lighting state.lighting.enable = !regs.lighting.disable; - state.lighting.src_num = regs.lighting.num_lights + 1; + state.lighting.src_num = regs.lighting.max_light_index + 1; for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) { unsigned num = regs.lighting.light_enable.GetNum(light_index); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 2da50bd62..f5f7ea61d 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -4,6 +4,7 @@ #include #include +#include "common/bit_set.h" #include "common/logging/log.h" #include "common/microprofile.h" #include "video_core/pica.h" @@ -19,38 +20,32 @@ namespace Pica { namespace Shader { -OutputVertex OutputVertex::FromRegisters(Math::Vec4 output_regs[16], const Regs& regs, - u32 output_mask) { +OutputVertex OutputVertex::FromAttributeBuffer(const Regs& regs, AttributeBuffer& input) { // Setup output data - OutputVertex ret; - // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to - // figure out what those circumstances are and enable the remaining outputs then. - unsigned index = 0; - for (unsigned i = 0; i < 7; ++i) { + union { + OutputVertex ret{}; + std::array vertex_slots; + }; + static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes."); - if (index >= regs.vs_output_total) - break; + unsigned int num_attributes = regs.vs_output_total; + ASSERT(num_attributes <= 7); + for (unsigned int i = 0; i < num_attributes; ++i) { + const auto& output_register_map = regs.vs_output_attributes[i]; - if ((output_mask & (1 << i)) == 0) - continue; - - const auto& output_register_map = regs.vs_output_attributes[index]; - - u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, - output_register_map.map_z, output_register_map.map_w}; + Regs::VSOutputAttributes::Semantic semantics[4] = { + output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, + output_register_map.map_w}; for (unsigned comp = 0; comp < 4; ++comp) { - float24* out = ((float24*)&ret) + semantics[comp]; - if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = output_regs[i][comp]; - } else { - // Zero output so that attributes which aren't output won't have denormals in them, - // which would slow us down later. - memset(out, 0, sizeof(*out)); + Regs::VSOutputAttributes::Semantic semantic = semantics[comp]; + float24* out = &vertex_slots[semantic]; + if (semantic < vertex_slots.size()) { + *out = input.attr[i][comp]; + } else if (semantic != Regs::VSOutputAttributes::INVALID) { + LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic); } } - - index++; } // The hardware takes the absolute and saturates vertex colors like this, *before* doing @@ -71,12 +66,20 @@ OutputVertex OutputVertex::FromRegisters(Math::Vec4 output_regs[16], co return ret; } -void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { - // Setup input register table - const auto& attribute_register_map = g_state.regs.vs.input_register_map; +void UnitState::LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input) { + const unsigned max_attribute = config.max_input_attribute_index; - for (int i = 0; i < num_attributes; i++) - registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; + for (unsigned attr = 0; attr <= max_attribute; ++attr) { + unsigned reg = config.GetRegisterForAttribute(attr); + registers.input[reg] = input.attr[attr]; + } +} + +void UnitState::WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output) { + unsigned int output_i = 0; + for (unsigned int reg : Common::BitSet(config.output_mask)) { + output.attr[output_i++] = registers.output[reg]; + } } MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 44d9f76c3..b188d3edf 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -23,14 +23,11 @@ namespace Pica { namespace Shader { -struct InputVertex { +struct AttributeBuffer { alignas(16) Math::Vec4 attr[16]; }; struct OutputVertex { - OutputVertex() = default; - - // VS output attributes Math::Vec4 pos; Math::Vec4 quat; Math::Vec4 color; @@ -42,43 +39,22 @@ struct OutputVertex { INSERT_PADDING_WORDS(1); Math::Vec2 tc2; - // Padding for optimal alignment - INSERT_PADDING_WORDS(4); - - // Attributes used to store intermediate results - - // position after perspective divide - Math::Vec3 screenpos; - INSERT_PADDING_WORDS(1); - - // Linear interpolation - // factor: 0=this, 1=vtx - void Lerp(float24 factor, const OutputVertex& vtx) { - pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); - - // TODO: Should perform perspective correct interpolation here... - tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); - tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); - tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); - - screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); - - color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); - } - - // Linear interpolation - // factor: 0=v0, 1=v1 - static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { - OutputVertex ret = v0; - ret.Lerp(factor, v1); - return ret; - } - - static OutputVertex FromRegisters(Math::Vec4 output_regs[16], const Regs& regs, - u32 output_mask); + static OutputVertex FromAttributeBuffer(const Regs& regs, AttributeBuffer& output); }; +#define ASSERT_POS(var, pos) \ + static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \ + "offset.") +ASSERT_POS(pos, Regs::VSOutputAttributes::POSITION_X); +ASSERT_POS(quat, Regs::VSOutputAttributes::QUATERNION_X); +ASSERT_POS(color, Regs::VSOutputAttributes::COLOR_R); +ASSERT_POS(tc0, Regs::VSOutputAttributes::TEXCOORD0_U); +ASSERT_POS(tc1, Regs::VSOutputAttributes::TEXCOORD1_U); +ASSERT_POS(tc0_w, Regs::VSOutputAttributes::TEXCOORD0_W); +ASSERT_POS(view, Regs::VSOutputAttributes::VIEW_X); +ASSERT_POS(tc2, Regs::VSOutputAttributes::TEXCOORD2_U); +#undef ASSERT_POS static_assert(std::is_pod::value, "Structure is not POD"); -static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); +static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size"); /** * This structure contains the state information that needs to be unique for a shader unit. The 3DS @@ -137,10 +113,12 @@ struct UnitState { /** * Loads the unit state with an input vertex. * - * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes to load + * @param config Shader configuration registers corresponding to the unit. + * @param input Attribute buffer to load into the input registers. */ - void LoadInputVertex(const InputVertex& input, int num_attributes); + void LoadInput(const Regs::ShaderConfig& config, const AttributeBuffer& input); + + void WriteOutput(const Regs::ShaderConfig& config, AttributeBuffer& output); }; struct ShaderSetup { diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index c0c89b857..81522b8f5 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -668,14 +668,14 @@ void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const { } DebugData InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, - const InputVertex& input, - int num_attributes) const { + const AttributeBuffer& input, + const Regs::ShaderConfig& config) const { UnitState state; DebugData debug_data; // Setup input register table boost::fill(state.registers.input, Math::Vec4::AssignToAll(float24::Zero())); - state.LoadInputVertex(input, num_attributes); + state.LoadInput(config, input); RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); return debug_data; } diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index d6c0e2d8c..d7a61e122 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h @@ -19,12 +19,11 @@ public: /** * Produce debug information based on the given shader and input vertex * @param input Input vertex into the shader - * @param num_attributes The number of vertex shader attributes * @param config Configuration object for the shader pipeline * @return Debug information for this shader with regards to the given vertex */ - DebugData ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input, - int num_attributes) const; + DebugData ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input, + const Regs::ShaderConfig& config) const; }; } // namespace diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp index 2b8ef7018..bf83b61ca 100644 --- a/src/video_core/vertex_loader.cpp +++ b/src/video_core/vertex_loader.cpp @@ -70,7 +70,8 @@ void VertexLoader::Setup(const Pica::Regs& regs) { is_setup = true; } -void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, +void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, + Shader::AttributeBuffer& input, DebugUtils::MemoryAccessTracker& memory_accesses) { ASSERT_MSG(is_setup, "A VertexLoader needs to be setup before loading vertices."); @@ -142,7 +143,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); } else if (vertex_attribute_is_default[i]) { // Load the default attribute if we're configured to do so - input.attr[i] = g_state.vs_default_attributes[i]; + input.attr[i] = g_state.input_default_attributes.attr[i]; LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", i, vertex, index, input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h index 9f2098bb2..51f3d45b4 100644 --- a/src/video_core/vertex_loader.h +++ b/src/video_core/vertex_loader.h @@ -11,7 +11,7 @@ class MemoryAccessTracker; } namespace Shader { -struct InputVertex; +struct AttributeBuffer; } class VertexLoader { @@ -22,7 +22,7 @@ public: } void Setup(const Pica::Regs& regs); - void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, + void LoadVertex(u32 base_address, int index, int vertex, Shader::AttributeBuffer& input, DebugUtils::MemoryAccessTracker& memory_accesses); int GetNumTotalAttributes() const {