shader: Split SSY and PBK stack
Hardware testing revealed that SSY and PBK push to different stacks, allowing code like this: SSY label1; PBK label2; SYNC; label1: PBK; label2: EXIT;
This commit is contained in:
parent
cd2d9628c9
commit
fe8e6618f2
|
@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
|
||||||
return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
|
return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr const char* GetFlowStackPrefix(MetaStackClass stack) {
|
||||||
|
switch (stack) {
|
||||||
|
case MetaStackClass::Ssy:
|
||||||
|
return "ssy";
|
||||||
|
case MetaStackClass::Pbk:
|
||||||
|
return "pbk";
|
||||||
|
}
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string FlowStackName(MetaStackClass stack) {
|
||||||
|
return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string FlowStackTopName(MetaStackClass stack) {
|
||||||
|
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
|
||||||
|
}
|
||||||
|
|
||||||
class GLSLDecompiler final {
|
class GLSLDecompiler final {
|
||||||
public:
|
public:
|
||||||
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
|
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
|
||||||
|
@ -173,8 +191,10 @@ public:
|
||||||
// TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
|
// TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
|
||||||
// unlikely that shaders will use 20 nested SSYs and PBKs.
|
// unlikely that shaders will use 20 nested SSYs and PBKs.
|
||||||
constexpr u32 FLOW_STACK_SIZE = 20;
|
constexpr u32 FLOW_STACK_SIZE = 20;
|
||||||
code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
|
for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
|
||||||
code.AddLine("uint flow_stack_top = 0u;");
|
code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
|
||||||
|
code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
|
||||||
|
}
|
||||||
|
|
||||||
code.AddLine("while (true) {{");
|
code.AddLine("while (true) {{");
|
||||||
++code.scope;
|
++code.scope;
|
||||||
|
@ -1438,15 +1458,18 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string PushFlowStack(Operation operation) {
|
std::string PushFlowStack(Operation operation) {
|
||||||
|
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
|
||||||
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
||||||
UNIMPLEMENTED_IF(!target);
|
UNIMPLEMENTED_IF(!target);
|
||||||
|
|
||||||
code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
|
code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
|
||||||
|
target->GetValue());
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string PopFlowStack(Operation operation) {
|
std::string PopFlowStack(Operation operation) {
|
||||||
code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
|
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
|
||||||
|
code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
|
||||||
code.AddLine("break;");
|
code.AddLine("break;");
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
|
@ -132,20 +132,16 @@ public:
|
||||||
branch_labels.push_back(label);
|
branch_labels.push_back(label);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
|
|
||||||
// that shaders will use 20 nested SSYs and PBKs.
|
|
||||||
constexpr u32 FLOW_STACK_SIZE = 20;
|
|
||||||
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
|
|
||||||
jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
|
jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
|
||||||
spv::StorageClass::Function, Constant(t_uint, first_address)));
|
spv::StorageClass::Function, Constant(t_uint, first_address)));
|
||||||
flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
|
std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
|
||||||
spv::StorageClass::Function, ConstantNull(flow_stack_type)));
|
std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
|
||||||
flow_stack_top =
|
|
||||||
Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
|
|
||||||
|
|
||||||
Name(jmp_to, "jmp_to");
|
Name(jmp_to, "jmp_to");
|
||||||
Name(flow_stack, "flow_stack");
|
Name(ssy_flow_stack, "ssy_flow_stack");
|
||||||
Name(flow_stack_top, "flow_stack_top");
|
Name(ssy_flow_stack_top, "ssy_flow_stack_top");
|
||||||
|
Name(pbk_flow_stack, "pbk_flow_stack");
|
||||||
|
Name(pbk_flow_stack_top, "pbk_flow_stack_top");
|
||||||
|
|
||||||
Emit(OpBranch(loop_label));
|
Emit(OpBranch(loop_label));
|
||||||
Emit(loop_label);
|
Emit(loop_label);
|
||||||
|
@ -952,6 +948,7 @@ private:
|
||||||
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
|
||||||
ASSERT(target);
|
ASSERT(target);
|
||||||
|
|
||||||
|
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
|
||||||
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
||||||
const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
|
const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
|
||||||
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
|
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
|
||||||
|
@ -962,6 +959,7 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
Id PopFlowStack(Operation operation) {
|
Id PopFlowStack(Operation operation) {
|
||||||
|
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
|
||||||
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
|
||||||
const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
|
const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
|
||||||
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
|
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
|
||||||
|
@ -1172,6 +1170,31 @@ private:
|
||||||
Emit(skip_label);
|
Emit(skip_label);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::tuple<Id, Id> CreateFlowStack() {
|
||||||
|
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
|
||||||
|
// that shaders will use 20 nested SSYs and PBKs.
|
||||||
|
constexpr u32 FLOW_STACK_SIZE = 20;
|
||||||
|
constexpr auto storage_class = spv::StorageClass::Function;
|
||||||
|
|
||||||
|
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
|
||||||
|
const Id stack = Emit(OpVariable(TypePointer(storage_class, flow_stack_type), storage_class,
|
||||||
|
ConstantNull(flow_stack_type)));
|
||||||
|
const Id top = Emit(OpVariable(t_func_uint, storage_class, Constant(t_uint, 0)));
|
||||||
|
return std::tie(stack, top);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<Id, Id> GetFlowStack(Operation operation) {
|
||||||
|
const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
|
||||||
|
switch (stack_class) {
|
||||||
|
case MetaStackClass::Ssy:
|
||||||
|
return {ssy_flow_stack, ssy_flow_stack_top};
|
||||||
|
case MetaStackClass::Pbk:
|
||||||
|
return {pbk_flow_stack, pbk_flow_stack_top};
|
||||||
|
}
|
||||||
|
UNREACHABLE();
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
static constexpr OperationDecompilersArray operation_decompilers = {
|
static constexpr OperationDecompilersArray operation_decompilers = {
|
||||||
&SPIRVDecompiler::Assign,
|
&SPIRVDecompiler::Assign,
|
||||||
|
|
||||||
|
@ -1414,8 +1437,10 @@ private:
|
||||||
|
|
||||||
Id execute_function{};
|
Id execute_function{};
|
||||||
Id jmp_to{};
|
Id jmp_to{};
|
||||||
Id flow_stack_top{};
|
Id ssy_flow_stack_top{};
|
||||||
Id flow_stack{};
|
Id pbk_flow_stack_top{};
|
||||||
|
Id ssy_flow_stack{};
|
||||||
|
Id pbk_flow_stack{};
|
||||||
Id continue_label{};
|
Id continue_label{};
|
||||||
std::map<u32, Id> labels;
|
std::map<u32, Id> labels;
|
||||||
};
|
};
|
||||||
|
|
|
@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||||
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
||||||
"Constant buffer flow is not supported");
|
"Constant buffer flow is not supported");
|
||||||
|
|
||||||
// The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
|
// The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
|
||||||
// target of the jump that the SYNC instruction will make. The SSY opcode has a similar
|
|
||||||
// structure to the BRA opcode.
|
|
||||||
const u32 target = pc + instr.bra.GetBranchTarget();
|
const u32 target = pc + instr.bra.GetBranchTarget();
|
||||||
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
|
bb.push_back(
|
||||||
|
Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OpCode::Id::PBK: {
|
case OpCode::Id::PBK: {
|
||||||
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
|
||||||
"Constant buffer PBK is not supported");
|
"Constant buffer PBK is not supported");
|
||||||
|
|
||||||
// PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
|
// PBK pushes to a stack the address where BRK will jump to.
|
||||||
// using SYNC on a PBK address will kill the shader execution. We don't emulate this because
|
|
||||||
// it's very unlikely a driver will emit such invalid shader.
|
|
||||||
const u32 target = pc + instr.bra.GetBranchTarget();
|
const u32 target = pc + instr.bra.GetBranchTarget();
|
||||||
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
|
bb.push_back(
|
||||||
|
Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OpCode::Id::SYNC: {
|
case OpCode::Id::SYNC: {
|
||||||
|
@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||||
static_cast<u32>(cc));
|
static_cast<u32>(cc));
|
||||||
|
|
||||||
// The SYNC opcode jumps to the address previously set by the SSY opcode
|
// The SYNC opcode jumps to the address previously set by the SSY opcode
|
||||||
bb.push_back(Operation(OperationCode::PopFlowStack));
|
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OpCode::Id::BRK: {
|
case OpCode::Id::BRK: {
|
||||||
|
@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
|
||||||
static_cast<u32>(cc));
|
static_cast<u32>(cc));
|
||||||
|
|
||||||
// The BRK opcode jumps to the address previously set by the PBK opcode
|
// The BRK opcode jumps to the address previously set by the PBK opcode
|
||||||
bb.push_back(Operation(OperationCode::PopFlowStack));
|
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OpCode::Id::IPA: {
|
case OpCode::Id::IPA: {
|
||||||
|
|
|
@ -174,6 +174,11 @@ enum class InternalFlag {
|
||||||
Amount = 4,
|
Amount = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum class MetaStackClass {
|
||||||
|
Ssy,
|
||||||
|
Pbk,
|
||||||
|
};
|
||||||
|
|
||||||
class OperationNode;
|
class OperationNode;
|
||||||
class ConditionalNode;
|
class ConditionalNode;
|
||||||
class GprNode;
|
class GprNode;
|
||||||
|
@ -285,7 +290,7 @@ struct MetaTexture {
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Parameters that modify an operation but are not part of any particular operand
|
/// Parameters that modify an operation but are not part of any particular operand
|
||||||
using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
|
using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
|
||||||
|
|
||||||
/// Holds any kind of operation that can be done in the IR
|
/// Holds any kind of operation that can be done in the IR
|
||||||
class OperationNode final {
|
class OperationNode final {
|
||||||
|
|
Loading…
Reference in a new issue