Merge pull request #2092 from ReinUsesLisp/stg
shader/memory: Implement STG and global memory flushing
This commit is contained in:
commit
1b83f255c2
|
@ -791,6 +791,12 @@ union Instruction {
|
||||||
BitField<20, 24, s64> immediate_offset;
|
BitField<20, 24, s64> immediate_offset;
|
||||||
} ldg;
|
} ldg;
|
||||||
|
|
||||||
|
union {
|
||||||
|
BitField<48, 3, UniformType> type;
|
||||||
|
BitField<46, 2, u64> cache_mode;
|
||||||
|
BitField<20, 24, s64> immediate_offset;
|
||||||
|
} stg;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
BitField<0, 3, u64> pred0;
|
BitField<0, 3, u64> pred0;
|
||||||
BitField<3, 3, u64> pred3;
|
BitField<3, 3, u64> pred3;
|
||||||
|
|
|
@ -14,28 +14,28 @@
|
||||||
|
|
||||||
namespace OpenGL {
|
namespace OpenGL {
|
||||||
|
|
||||||
CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr)
|
CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
|
||||||
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} {
|
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
|
||||||
|
max_size{max_size} {
|
||||||
buffer.Create();
|
buffer.Create();
|
||||||
// Bind and unbind the buffer so it gets allocated by the driver
|
|
||||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
|
|
||||||
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
|
|
||||||
LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
|
LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
|
||||||
}
|
}
|
||||||
|
|
||||||
void CachedGlobalRegion::Reload(u32 size_) {
|
CachedGlobalRegion::~CachedGlobalRegion() = default;
|
||||||
constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize);
|
|
||||||
|
|
||||||
|
void CachedGlobalRegion::Reload(u32 size_) {
|
||||||
size = size_;
|
size = size_;
|
||||||
if (size > max_size) {
|
if (size > max_size) {
|
||||||
size = max_size;
|
size = max_size;
|
||||||
LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_,
|
LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
|
||||||
max_size);
|
max_size);
|
||||||
}
|
}
|
||||||
|
glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
|
/// Copies the first `size` bytes of the OpenGL buffer back into the host memory pointed to by
/// `host_ptr`.
void CachedGlobalRegion::Flush() {
    // Print the address as zero-padded hexadecimal so it matches the "0x" prefix; the previous
    // "{:16}" spec produced a space-padded decimal number.
    LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:016X}", size, cpu_addr);
    glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
}
|
||||||
|
|
||||||
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
|
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
|
||||||
|
@ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr,
|
||||||
return search->second;
|
return search->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size,
|
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
|
||||||
u8* host_ptr) {
|
u32 size) {
|
||||||
GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
|
GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
|
||||||
if (!region) {
|
if (!region) {
|
||||||
// No reserved global region available, create a new one and reserve it
|
// No reserved global region available, create a new one and reserve it
|
||||||
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
|
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
|
||||||
const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr);
|
const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
|
||||||
region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr);
|
ASSERT(cpu_addr);
|
||||||
|
|
||||||
|
region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
|
||||||
ReserveGlobalRegion(region);
|
ReserveGlobalRegion(region);
|
||||||
}
|
}
|
||||||
region->Reload(size);
|
region->Reload(size);
|
||||||
|
@ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Builds the cache and queries the driver for GL_MAX_SHADER_STORAGE_BLOCK_SIZE, which is stored
/// in `max_ssbo_size` and later passed to every CachedGlobalRegion as its size limit.
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
    : RasterizerCache{rasterizer} {
    // Zero-initialise: if the query raises a GL error the output is left untouched, and reading
    // an indeterminate GLint would be undefined behaviour.
    GLint max_ssbo_size_{};
    glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
    max_ssbo_size = static_cast<u32>(max_ssbo_size_);
}
|
||||||
|
|
||||||
GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
|
GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
|
||||||
const GLShader::GlobalMemoryEntry& global_region,
|
const GLShader::GlobalMemoryEntry& global_region,
|
||||||
|
@ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
|
||||||
|
|
||||||
auto& gpu{Core::System::GetInstance().GPU()};
|
auto& gpu{Core::System::GetInstance().GPU()};
|
||||||
auto& memory_manager{gpu.MemoryManager()};
|
auto& memory_manager{gpu.MemoryManager()};
|
||||||
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]};
|
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
|
||||||
const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
|
const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
|
||||||
global_region.GetCbufOffset()};
|
global_region.GetCbufOffset()};
|
||||||
const auto actual_addr{memory_manager.Read<u64>(addr)};
|
const auto actual_addr{memory_manager.Read<u64>(addr)};
|
||||||
|
@ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
|
||||||
|
|
||||||
if (!region) {
|
if (!region) {
|
||||||
// No global region found - create a new one
|
// No global region found - create a new one
|
||||||
region = GetUncachedGlobalRegion(actual_addr, size, host_ptr);
|
region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
|
||||||
Register(region);
|
Register(region);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ namespace OpenGL {
|
||||||
|
|
||||||
namespace GLShader {
|
namespace GLShader {
|
||||||
class GlobalMemoryEntry;
|
class GlobalMemoryEntry;
|
||||||
} // namespace GLShader
|
}
|
||||||
|
|
||||||
class RasterizerOpenGL;
|
class RasterizerOpenGL;
|
||||||
class CachedGlobalRegion;
|
class CachedGlobalRegion;
|
||||||
|
@ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
|
||||||
|
|
||||||
class CachedGlobalRegion final : public RasterizerCacheObject {
|
class CachedGlobalRegion final : public RasterizerCacheObject {
|
||||||
public:
|
public:
|
||||||
explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr);
|
explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
|
||||||
|
~CachedGlobalRegion();
|
||||||
|
|
||||||
VAddr GetCpuAddr() const override {
|
VAddr GetCpuAddr() const override {
|
||||||
return cpu_addr;
|
return cpu_addr;
|
||||||
|
@ -45,14 +46,14 @@ public:
|
||||||
/// Reloads the global region from guest memory
|
/// Reloads the global region from guest memory
|
||||||
void Reload(u32 size_);
|
void Reload(u32 size_);
|
||||||
|
|
||||||
// TODO(Rodrigo): When global memory is written (STG), implement flushing
|
void Flush() override;
|
||||||
void Flush() override {
|
|
||||||
UNIMPLEMENTED();
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
VAddr cpu_addr{};
|
VAddr cpu_addr{};
|
||||||
|
u8* host_ptr{};
|
||||||
u32 size{};
|
u32 size{};
|
||||||
|
u32 max_size{};
|
||||||
|
|
||||||
OGLBuffer buffer;
|
OGLBuffer buffer;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -66,10 +67,11 @@ public:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
|
GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
|
||||||
GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr);
|
GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
|
||||||
void ReserveGlobalRegion(GlobalRegion region);
|
void ReserveGlobalRegion(GlobalRegion region);
|
||||||
|
|
||||||
std::unordered_map<CacheAddr, GlobalRegion> reserve;
|
std::unordered_map<CacheAddr, GlobalRegion> reserve;
|
||||||
|
u32 max_ssbo_size{};
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace OpenGL
|
} // namespace OpenGL
|
||||||
|
|
|
@ -756,6 +756,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
res_cache.FlushRegion(addr, size);
|
res_cache.FlushRegion(addr, size);
|
||||||
|
global_cache.FlushRegion(addr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
|
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
|
||||||
|
@ -953,6 +954,9 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
|
||||||
for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
|
for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
|
||||||
const auto& entry{entries[bindpoint]};
|
const auto& entry{entries[bindpoint]};
|
||||||
const auto& region{global_cache.GetGlobalRegion(entry, stage)};
|
const auto& region{global_cache.GetGlobalRegion(entry, stage)};
|
||||||
|
if (entry.IsWritten()) {
|
||||||
|
region->MarkAsModified(true, global_cache);
|
||||||
|
}
|
||||||
bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
|
bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
|
||||||
static_cast<GLsizeiptr>(region->GetSizeInBytes()));
|
static_cast<GLsizeiptr>(region->GetSizeInBytes()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -71,10 +71,6 @@ public:
|
||||||
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
|
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
|
||||||
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");
|
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");
|
||||||
|
|
||||||
static constexpr std::size_t MaxGlobalMemorySize = 0x10000;
|
|
||||||
static_assert(MaxGlobalMemorySize % sizeof(float) == 0,
|
|
||||||
"The maximum size of a global memory must be a multiple of the size of float");
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
class SamplerInfo {
|
class SamplerInfo {
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -45,8 +45,6 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>;
|
||||||
enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
|
enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
|
||||||
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
|
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
|
||||||
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
|
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
|
||||||
constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
|
|
||||||
static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);
|
|
||||||
|
|
||||||
class ShaderWriter {
|
class ShaderWriter {
|
||||||
public:
|
public:
|
||||||
|
@ -208,8 +206,10 @@ public:
|
||||||
for (const auto& sampler : ir.GetSamplers()) {
|
for (const auto& sampler : ir.GetSamplers()) {
|
||||||
entries.samplers.emplace_back(sampler);
|
entries.samplers.emplace_back(sampler);
|
||||||
}
|
}
|
||||||
for (const auto& gmem : ir.GetGlobalMemoryBases()) {
|
for (const auto& gmem_pair : ir.GetGlobalMemory()) {
|
||||||
entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset);
|
const auto& [base, usage] = gmem_pair;
|
||||||
|
entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset,
|
||||||
|
usage.is_read, usage.is_written);
|
||||||
}
|
}
|
||||||
entries.clip_distances = ir.GetClipDistances();
|
entries.clip_distances = ir.GetClipDistances();
|
||||||
entries.shader_length = ir.GetLength();
|
entries.shader_length = ir.GetLength();
|
||||||
|
@ -380,12 +380,22 @@ private:
|
||||||
}
|
}
|
||||||
|
|
||||||
void DeclareGlobalMemory() {
|
void DeclareGlobalMemory() {
|
||||||
for (const auto& entry : ir.GetGlobalMemoryBases()) {
|
for (const auto& gmem : ir.GetGlobalMemory()) {
|
||||||
|
const auto& [base, usage] = gmem;
|
||||||
|
|
||||||
|
// Since we don't know how the shader will use the buffer, hint the driver to disable as
|
||||||
|
// many optimizations as possible
|
||||||
|
std::string qualifier = "coherent volatile";
|
||||||
|
if (usage.is_read && !usage.is_written)
|
||||||
|
qualifier += " readonly";
|
||||||
|
else if (usage.is_written && !usage.is_read)
|
||||||
|
qualifier += " writeonly";
|
||||||
|
|
||||||
const std::string binding =
|
const std::string binding =
|
||||||
fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset);
|
fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset);
|
||||||
code.AddLine("layout (std430, binding = " + binding + ") buffer " +
|
code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " +
|
||||||
GetGlobalMemoryBlock(entry) + " {");
|
GetGlobalMemoryBlock(base) + " {");
|
||||||
code.AddLine(" float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];");
|
code.AddLine(" float " + GetGlobalMemory(base) + "[];");
|
||||||
code.AddLine("};");
|
code.AddLine("};");
|
||||||
code.AddNewLine();
|
code.AddNewLine();
|
||||||
}
|
}
|
||||||
|
@ -868,6 +878,12 @@ private:
|
||||||
} else if (const auto lmem = std::get_if<LmemNode>(dest)) {
|
} else if (const auto lmem = std::get_if<LmemNode>(dest)) {
|
||||||
target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]";
|
target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]";
|
||||||
|
|
||||||
|
} else if (const auto gmem = std::get_if<GmemNode>(dest)) {
|
||||||
|
const std::string real = Visit(gmem->GetRealAddress());
|
||||||
|
const std::string base = Visit(gmem->GetBaseAddress());
|
||||||
|
const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
|
||||||
|
target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
UNREACHABLE_MSG("Assign called without a proper target");
|
UNREACHABLE_MSG("Assign called without a proper target");
|
||||||
}
|
}
|
||||||
|
@ -1621,9 +1637,7 @@ private:
|
||||||
|
|
||||||
std::string GetCommonDeclarations() {
|
std::string GetCommonDeclarations() {
|
||||||
const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
|
const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
|
||||||
const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
|
|
||||||
return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
|
return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
|
||||||
"#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" +
|
|
||||||
"#define ftoi floatBitsToInt\n"
|
"#define ftoi floatBitsToInt\n"
|
||||||
"#define ftou floatBitsToUint\n"
|
"#define ftou floatBitsToUint\n"
|
||||||
"#define itof intBitsToFloat\n"
|
"#define itof intBitsToFloat\n"
|
||||||
|
|
|
@ -39,8 +39,9 @@ private:
|
||||||
|
|
||||||
class GlobalMemoryEntry {
|
class GlobalMemoryEntry {
|
||||||
public:
|
public:
|
||||||
explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset)
|
explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written)
|
||||||
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {}
|
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{
|
||||||
|
is_written} {}
|
||||||
|
|
||||||
u32 GetCbufIndex() const {
|
u32 GetCbufIndex() const {
|
||||||
return cbuf_index;
|
return cbuf_index;
|
||||||
|
@ -50,9 +51,19 @@ public:
|
||||||
return cbuf_offset;
|
return cbuf_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsRead() const {
|
||||||
|
return is_read;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IsWritten() const {
|
||||||
|
return is_written;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
u32 cbuf_index{};
|
u32 cbuf_index{};
|
||||||
u32 cbuf_offset{};
|
u32 cbuf_offset{};
|
||||||
|
bool is_read{};
|
||||||
|
bool is_written{};
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ShaderEntries {
|
struct ShaderEntries {
|
||||||
|
|
|
@ -337,11 +337,16 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
|
||||||
for (u32 i = 0; i < global_memory_count; ++i) {
|
for (u32 i = 0; i < global_memory_count; ++i) {
|
||||||
u32 cbuf_index{};
|
u32 cbuf_index{};
|
||||||
u32 cbuf_offset{};
|
u32 cbuf_offset{};
|
||||||
|
u8 is_read{};
|
||||||
|
u8 is_written{};
|
||||||
if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) ||
|
if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) ||
|
||||||
file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) {
|
file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) ||
|
||||||
|
file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) ||
|
||||||
|
file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset);
|
entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
|
||||||
|
is_written != 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& clip_distance : entry.entries.clip_distances) {
|
for (auto& clip_distance : entry.entries.clip_distances) {
|
||||||
|
@ -397,7 +402,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 uniqu
|
||||||
return false;
|
return false;
|
||||||
for (const auto& gmem : entries.global_memory_entries) {
|
for (const auto& gmem : entries.global_memory_entries) {
|
||||||
if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
|
if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
|
||||||
file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) {
|
file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 ||
|
||||||
|
file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 ||
|
||||||
|
file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -191,8 +191,9 @@ public:
|
||||||
for (const auto& cbuf : ir.GetConstantBuffers()) {
|
for (const auto& cbuf : ir.GetConstantBuffers()) {
|
||||||
entries.const_buffers.emplace_back(cbuf.second, cbuf.first);
|
entries.const_buffers.emplace_back(cbuf.second, cbuf.first);
|
||||||
}
|
}
|
||||||
for (const auto& gmem : ir.GetGlobalMemoryBases()) {
|
for (const auto& gmem_pair : ir.GetGlobalMemory()) {
|
||||||
entries.global_buffers.emplace_back(gmem.cbuf_index, gmem.cbuf_offset);
|
const auto& [base, usage] = gmem_pair;
|
||||||
|
entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset);
|
||||||
}
|
}
|
||||||
for (const auto& sampler : ir.GetSamplers()) {
|
for (const auto& sampler : ir.GetSamplers()) {
|
||||||
entries.samplers.emplace_back(sampler);
|
entries.samplers.emplace_back(sampler);
|
||||||
|
@ -225,7 +226,7 @@ private:
|
||||||
return current_binding;
|
return current_binding;
|
||||||
};
|
};
|
||||||
const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size());
|
const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size());
|
||||||
global_buffers_base_binding = Allocate(ir.GetGlobalMemoryBases().size());
|
global_buffers_base_binding = Allocate(ir.GetGlobalMemory().size());
|
||||||
samplers_base_binding = Allocate(ir.GetSamplers().size());
|
samplers_base_binding = Allocate(ir.GetSamplers().size());
|
||||||
|
|
||||||
ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE,
|
ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE,
|
||||||
|
@ -390,14 +391,15 @@ private:
|
||||||
|
|
||||||
void DeclareGlobalBuffers() {
|
void DeclareGlobalBuffers() {
|
||||||
u32 binding = global_buffers_base_binding;
|
u32 binding = global_buffers_base_binding;
|
||||||
for (const auto& entry : ir.GetGlobalMemoryBases()) {
|
for (const auto& entry : ir.GetGlobalMemory()) {
|
||||||
|
const auto [base, usage] = entry;
|
||||||
const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer);
|
const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer);
|
||||||
AddGlobalVariable(
|
AddGlobalVariable(
|
||||||
Name(id, fmt::format("gmem_{}_{}", entry.cbuf_index, entry.cbuf_offset)));
|
Name(id, fmt::format("gmem_{}_{}", base.cbuf_index, base.cbuf_offset)));
|
||||||
|
|
||||||
Decorate(id, spv::Decoration::Binding, binding++);
|
Decorate(id, spv::Decoration::Binding, binding++);
|
||||||
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
|
Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET);
|
||||||
global_buffers.emplace(entry, id);
|
global_buffers.emplace(base, id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,23 @@ using Tegra::Shader::Instruction;
|
||||||
using Tegra::Shader::OpCode;
|
using Tegra::Shader::OpCode;
|
||||||
using Tegra::Shader::Register;
|
using Tegra::Shader::Register;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
/// Returns how many 32-bit elements a load/store of the given uniform type covers:
/// 1 for Single, 2 for Double, 4 for Quad and UnsignedQuad. Any other type is reported as
/// unimplemented and treated as a single element.
u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
    using Tegra::Shader::UniformType;

    if (uniform_type == UniformType::Single) {
        return 1;
    }
    if (uniform_type == UniformType::Double) {
        return 2;
    }
    if (uniform_type == UniformType::Quad || uniform_type == UniformType::UnsignedQuad) {
        return 4;
    }

    UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
    return 1;
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||||
const Instruction instr = {program_code[pc]};
|
const Instruction instr = {program_code[pc]};
|
||||||
const auto opcode = OpCode::Decode(instr);
|
const auto opcode = OpCode::Decode(instr);
|
||||||
|
@ -126,45 +143,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case OpCode::Id::LDG: {
|
case OpCode::Id::LDG: {
|
||||||
const u32 count = [&]() {
|
const auto [real_address_base, base_address, descriptor] =
|
||||||
switch (instr.ldg.type) {
|
TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
|
||||||
case Tegra::Shader::UniformType::Single:
|
static_cast<u32>(instr.ldg.immediate_offset.Value()), false);
|
||||||
return 1;
|
|
||||||
case Tegra::Shader::UniformType::Double:
|
|
||||||
return 2;
|
|
||||||
case Tegra::Shader::UniformType::Quad:
|
|
||||||
case Tegra::Shader::UniformType::UnsignedQuad:
|
|
||||||
return 4;
|
|
||||||
default:
|
|
||||||
UNIMPLEMENTED_MSG("Unimplemented LDG size!");
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}();
|
|
||||||
|
|
||||||
const Node addr_register = GetRegister(instr.gpr8);
|
|
||||||
const Node base_address =
|
|
||||||
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
|
|
||||||
const auto cbuf = std::get_if<CbufNode>(base_address);
|
|
||||||
ASSERT(cbuf != nullptr);
|
|
||||||
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
|
|
||||||
ASSERT(cbuf_offset_imm != nullptr);
|
|
||||||
const auto cbuf_offset = cbuf_offset_imm->GetValue();
|
|
||||||
|
|
||||||
bb.push_back(Comment(
|
|
||||||
fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
|
|
||||||
|
|
||||||
const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
|
|
||||||
used_global_memory_bases.insert(descriptor);
|
|
||||||
|
|
||||||
const Node immediate_offset =
|
|
||||||
Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value()));
|
|
||||||
const Node base_real_address =
|
|
||||||
Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register);
|
|
||||||
|
|
||||||
|
const u32 count = GetUniformTypeElementsCount(instr.ldg.type);
|
||||||
for (u32 i = 0; i < count; ++i) {
|
for (u32 i = 0; i < count; ++i) {
|
||||||
const Node it_offset = Immediate(i * 4);
|
const Node it_offset = Immediate(i * 4);
|
||||||
const Node real_address =
|
const Node real_address =
|
||||||
Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset);
|
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
|
||||||
const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
|
const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
|
||||||
|
|
||||||
SetTemporal(bb, i, gmem);
|
SetTemporal(bb, i, gmem);
|
||||||
|
@ -174,6 +161,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case OpCode::Id::STG: {
|
||||||
|
const auto [real_address_base, base_address, descriptor] =
|
||||||
|
TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
|
||||||
|
static_cast<u32>(instr.stg.immediate_offset.Value()), true);
|
||||||
|
|
||||||
|
// Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
|
||||||
|
SetTemporal(bb, 0, real_address_base);
|
||||||
|
|
||||||
|
const u32 count = GetUniformTypeElementsCount(instr.stg.type);
|
||||||
|
for (u32 i = 0; i < count; ++i) {
|
||||||
|
SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
|
||||||
|
}
|
||||||
|
for (u32 i = 0; i < count; ++i) {
|
||||||
|
const Node it_offset = Immediate(i * 4);
|
||||||
|
const Node real_address =
|
||||||
|
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
|
||||||
|
const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
|
||||||
|
|
||||||
|
bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
case OpCode::Id::ST_A: {
|
case OpCode::Id::ST_A: {
|
||||||
UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
|
UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
|
||||||
"Indirect attribute loads are not supported");
|
"Indirect attribute loads are not supported");
|
||||||
|
@ -236,4 +245,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
|
||||||
return pc;
|
return pc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tracks the constant buffer entry that `addr_register` was loaded from and registers it as a
/// used global memory region.
/// @param bb               Block that receives a comment describing the tracked base address.
/// @param addr_register    Register node holding the address being accessed.
/// @param immediate_offset Immediate offset added to `addr_register`.
/// @param is_write         True marks the region as written, false marks it as read.
/// @return {immediate_offset + addr_register, tracked constant buffer node, region descriptor}.
std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb,
                                                                           Node addr_register,
                                                                           u32 immediate_offset,
                                                                           bool is_write) {
    const Node base_address{
        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
    const auto cbuf = std::get_if<CbufNode>(base_address);
    ASSERT(cbuf != nullptr);
    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
    ASSERT(cbuf_offset_imm != nullptr);
    const auto cbuf_offset = cbuf_offset_imm->GetValue();

    bb.push_back(
        Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));

    const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
    // operator[] default-constructs the usage flags on first access, so there is no need for
    // the unused "inserted" result that try_emplace returned.
    auto& usage = used_global_memory[descriptor];
    if (is_write) {
        usage.is_written = true;
    } else {
        usage.is_read = true;
    }

    const auto real_address =
        Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);

    return {real_address, base_address, descriptor};
}
|
||||||
|
|
||||||
} // namespace VideoCommon::Shader
|
} // namespace VideoCommon::Shader
|
||||||
|
|
|
@ -276,6 +276,11 @@ struct GlobalMemoryBase {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct GlobalMemoryUsage {
|
||||||
|
bool is_read{};
|
||||||
|
bool is_written{};
|
||||||
|
};
|
||||||
|
|
||||||
struct MetaArithmetic {
|
struct MetaArithmetic {
|
||||||
bool precise{};
|
bool precise{};
|
||||||
};
|
};
|
||||||
|
@ -578,8 +583,8 @@ public:
|
||||||
return used_clip_distances;
|
return used_clip_distances;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const {
|
const std::map<GlobalMemoryBase, GlobalMemoryUsage>& GetGlobalMemory() const {
|
||||||
return used_global_memory_bases;
|
return used_global_memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t GetLength() const {
|
std::size_t GetLength() const {
|
||||||
|
@ -781,6 +786,11 @@ private:
|
||||||
|
|
||||||
std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);
|
std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);
|
||||||
|
|
||||||
|
std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb,
|
||||||
|
Node addr_register,
|
||||||
|
u32 immediate_offset,
|
||||||
|
bool is_write);
|
||||||
|
|
||||||
template <typename... T>
|
template <typename... T>
|
||||||
Node Operation(OperationCode code, const T*... operands) {
|
Node Operation(OperationCode code, const T*... operands) {
|
||||||
return StoreNode(OperationNode(code, operands...));
|
return StoreNode(OperationNode(code, operands...));
|
||||||
|
@ -834,7 +844,7 @@ private:
|
||||||
std::map<u32, ConstBuffer> used_cbufs;
|
std::map<u32, ConstBuffer> used_cbufs;
|
||||||
std::set<Sampler> used_samplers;
|
std::set<Sampler> used_samplers;
|
||||||
std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
|
std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
|
||||||
std::set<GlobalMemoryBase> used_global_memory_bases;
|
std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
|
||||||
|
|
||||||
Tegra::Shader::Header header;
|
Tegra::Shader::Header header;
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue