From db8a601cf82cd4797c32931f13a585c64527780d Mon Sep 17 00:00:00 2001
From: Ameer J <52414509+ameerj@users.noreply.github.com>
Date: Tue, 19 Dec 2023 17:32:31 -0500
Subject: [PATCH 1/2] OpenGL: Add GL_PRIMITIVES_GENERATED and
 GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN queries

---
 src/video_core/engines/maxwell_3d.cpp         | 24 ++++++---
 src/video_core/query_cache.h                  | 39 +++++++--------
 .../renderer_opengl/gl_query_cache.cpp        | 23 ++++++---
 .../renderer_opengl/gl_rasterizer.cpp         | 49 +++++++++++++------
 .../renderer_opengl/gl_rasterizer.h           |  3 ++
 .../renderer_vulkan/vk_rasterizer.cpp         |  4 ++
 6 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 592c28ba3..95ba4f76c 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -586,14 +586,22 @@ void Maxwell3D::ProcessQueryCondition() {
 }
 
 void Maxwell3D::ProcessCounterReset() {
-    switch (regs.clear_report_value) {
-    case Regs::ClearReport::ZPassPixelCount:
-        rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64);
-        break;
-    default:
-        LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value);
-        break;
-    }
+    const auto query_type = [clear_report = regs.clear_report_value]() {
+        switch (clear_report) {
+        case Tegra::Engines::Maxwell3D::Regs::ClearReport::ZPassPixelCount:
+            return VideoCommon::QueryType::ZPassPixelCount64;
+        case Tegra::Engines::Maxwell3D::Regs::ClearReport::StreamingPrimitivesSucceeded:
+            return VideoCommon::QueryType::StreamingPrimitivesSucceeded;
+        case Tegra::Engines::Maxwell3D::Regs::ClearReport::PrimitivesGenerated:
+            return VideoCommon::QueryType::PrimitivesGenerated;
+        case Tegra::Engines::Maxwell3D::Regs::ClearReport::VtgPrimitivesOut:
+            return VideoCommon::QueryType::VtgPrimitivesOut;
+        default:
+            LOG_DEBUG(HW_GPU, "Unimplemented counter reset={}", clear_report);
+            return VideoCommon::QueryType::Payload;
+        }
+    }();
+    rasterizer->ResetCounter(query_type);
 }
 
 void Maxwell3D::ProcessSyncPoint() {
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 9fcaeeac7..a64404ce4 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -28,8 +28,11 @@
 namespace VideoCore {
 enum class QueryType {
     SamplesPassed,
+    PrimitivesGenerated,
+    TfbPrimitivesWritten,
+    Count,
 };
-constexpr std::size_t NumQueryTypes = 1;
+constexpr std::size_t NumQueryTypes = static_cast<size_t>(QueryType::Count);
 } // namespace VideoCore
 
 namespace VideoCommon {
@@ -44,15 +47,6 @@ public:
     explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_)
         : cache{cache_}, type{type_} {}
 
-    /// Updates the state of the stream, enabling or disabling as needed.
-    void Update(bool enabled) {
-        if (enabled) {
-            Enable();
-        } else {
-            Disable();
-        }
-    }
-
     /// Resets the stream to zero. It doesn't disable the query after resetting.
     void Reset() {
         if (current) {
@@ -80,7 +74,6 @@ public:
         return current != nullptr;
     }
 
-private:
     /// Enables the stream.
     void Enable() {
         if (current) {
@@ -97,6 +90,7 @@ private:
         last = std::exchange(current, nullptr);
     }
 
+private:
     QueryCache& cache;
     const VideoCore::QueryType type;
 
@@ -112,8 +106,14 @@ public:
         : rasterizer{rasterizer_},
           // Use reinterpret_cast instead of static_cast as workaround for
           // UBSan bug (https://github.com/llvm/llvm-project/issues/59060)
-          cpu_memory{cpu_memory_}, streams{{CounterStream{reinterpret_cast<QueryCache&>(*this),
-                                                          VideoCore::QueryType::SamplesPassed}}} {
+          cpu_memory{cpu_memory_}, streams{{
+                                       {CounterStream{reinterpret_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::SamplesPassed}},
+                                       {CounterStream{reinterpret_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::PrimitivesGenerated}},
+                                       {CounterStream{reinterpret_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::TfbPrimitivesWritten}},
+                                   }} {
         (void)slot_async_jobs.insert(); // Null value
     }
 
@@ -157,12 +157,11 @@ public:
         AsyncFlushQuery(query, timestamp, lock);
     }
 
-    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
-    void UpdateCounters() {
+    /// Enables all available GPU counters
+    void EnableCounters() {
         std::unique_lock lock{mutex};
-        if (maxwell3d) {
-            const auto& regs = maxwell3d->regs;
-            Stream(VideoCore::QueryType::SamplesPassed).Update(regs.zpass_pixel_count_enable);
+        for (auto& stream : streams) {
+            stream.Enable();
         }
     }
 
@@ -176,7 +175,7 @@ public:
     void DisableStreams() {
         std::unique_lock lock{mutex};
         for (auto& stream : streams) {
-            stream.Update(false);
+            stream.Disable();
         }
     }
 
@@ -353,7 +352,7 @@ private:
 
     std::shared_ptr<std::vector<AsyncJobId>> uncommitted_flushes{};
     std::list<std::shared_ptr<std::vector<AsyncJobId>>> committed_flushes;
-};
+};
 
 template <class QueryCache, class HostCounter>
 class HostCounterBase {
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp
index ec142d48e..fef7360ed 100644
--- a/src/video_core/renderer_opengl/gl_query_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_query_cache.cpp
@@ -18,16 +18,27 @@ namespace OpenGL {
 
 namespace {
 
-constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
-
 constexpr GLenum GetTarget(VideoCore::QueryType type) {
-    return QueryTargets[static_cast<std::size_t>(type)];
+    switch (type) {
+    case VideoCore::QueryType::SamplesPassed:
+        return GL_SAMPLES_PASSED;
+    case VideoCore::QueryType::PrimitivesGenerated:
+        return GL_PRIMITIVES_GENERATED;
+    case VideoCore::QueryType::TfbPrimitivesWritten:
+        return GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN;
+    default:
+        break;
+    }
+    UNIMPLEMENTED_MSG("Query type {}", type);
+    return 0;
 }
 
 } // Anonymous namespace
 
 QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_)
-    : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {}
+    : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {
+    EnableCounters();
+}
 
 QueryCache::~QueryCache() = default;
 
@@ -103,13 +114,13 @@ u64 CachedQuery::Flush([[maybe_unused]] bool async) {
     auto& stream = cache->Stream(type);
     const bool slice_counter = WaitPending() && stream.IsEnabled();
     if (slice_counter) {
-        stream.Update(false);
+        stream.Disable();
     }
 
     auto result = VideoCommon::CachedQueryBase<HostCounter>::Flush();
 
     if (slice_counter) {
-        stream.Update(true);
+        stream.Enable();
     }
 
     return result;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 279e5a4e0..0545e33c5 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -51,6 +51,22 @@ constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
 void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }
+
+std::optional<VideoCore::QueryType> MaxwellToVideoCoreQuery(VideoCommon::QueryType type) {
+    switch (type) {
+    case VideoCommon::QueryType::PrimitivesGenerated:
+    case VideoCommon::QueryType::VtgPrimitivesOut:
+        return VideoCore::QueryType::PrimitivesGenerated;
+    case VideoCommon::QueryType::ZPassPixelCount64:
+        return VideoCore::QueryType::SamplesPassed;
+    case VideoCommon::QueryType::StreamingPrimitivesSucceeded:
+        // case VideoCommon::QueryType::StreamingByteCount:
+        // TODO: StreamingByteCount = StreamingPrimitivesSucceeded * num_verts * vert_stride
+        return VideoCore::QueryType::TfbPrimitivesWritten;
+    default:
+        return std::nullopt;
+    }
+}
 } // Anonymous namespace
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
@@ -212,7 +228,6 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
 
     SCOPE_EXIT({ gpu.TickWork(); });
     gpu_memory->FlushCaching();
-    query_cache.UpdateCounters();
 
     GraphicsPipeline* const pipeline{shader_cache.CurrentGraphicsPipeline()};
     if (!pipeline) {
@@ -330,7 +345,6 @@ void RasterizerOpenGL::DrawTexture() {
     MICROPROFILE_SCOPE(OpenGL_Drawing);
 
     SCOPE_EXIT({ gpu.TickWork(); });
-    query_cache.UpdateCounters();
 
     texture_cache.SynchronizeGraphicsDescriptors();
     texture_cache.UpdateRenderTargets(false);
@@ -397,21 +411,28 @@ void RasterizerOpenGL::DispatchCompute() {
 }
 
 void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) {
-    if (type == VideoCommon::QueryType::ZPassPixelCount64) {
-        query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed);
+    const auto query_cache_type = MaxwellToVideoCoreQuery(type);
+    if (!query_cache_type.has_value()) {
+        UNIMPLEMENTED_MSG("Reset query type: {}", type);
+        return;
     }
+    query_cache.ResetCounter(*query_cache_type);
 }
 
 void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
                              VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
-    if (type == VideoCommon::QueryType::ZPassPixelCount64) {
-        if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
-            query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()});
-        } else {
-            query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt);
-        }
-        return;
+    const auto query_cache_type = MaxwellToVideoCoreQuery(type);
+    if (!query_cache_type.has_value()) {
+        return QueryFallback(gpu_addr, type, flags, payload, subreport);
     }
+    const bool has_timeout = True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout);
+    const auto timestamp = has_timeout ? std::optional<u64>{gpu.GetTicks()} : std::nullopt;
+    query_cache.Query(gpu_addr, *query_cache_type, timestamp);
+}
+
+void RasterizerOpenGL::QueryFallback(GPUVAddr gpu_addr, VideoCommon::QueryType type,
+                                     VideoCommon::QueryPropertiesFlags flags, u32 payload,
+                                     u32 subreport) {
     if (type != VideoCommon::QueryType::Payload) {
         payload = 1u;
     }
@@ -1294,15 +1315,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GraphicsPipeline* program, GLenum
     program->ConfigureTransformFeedback();
 
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
-                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation) ||
-                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Geometry));
-    UNIMPLEMENTED_IF(primitive_mode != GL_POINTS);
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
 
     // We may have to call BeginTransformFeedbackNV here since they seem to call different
     // implementations on Nvidia's driver (the pointer is different) but we are using
     // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
     // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
-    glBeginTransformFeedback(GL_POINTS);
+    glBeginTransformFeedback(primitive_mode);
 }
 
 void RasterizerOpenGL::EndTransformFeedback() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index ceffe1f1e..b79d7a70c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -225,6 +225,9 @@ private:
     /// End a transform feedback
     void EndTransformFeedback();
 
+    void QueryFallback(GPUVAddr gpu_addr, VideoCommon::QueryType type,
+                       VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport);
+
     Tegra::GPU& gpu;
 
     const Device& device;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 59829c88b..241fc34be 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -485,6 +485,10 @@ void RasterizerVulkan::DispatchCompute() {
 }
 
 void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) {
+    if (type != VideoCommon::QueryType::ZPassPixelCount64) {
+        LOG_DEBUG(Render_Vulkan, "Unimplemented counter reset={}", type);
+        return;
+    }
     query_cache.CounterReset(type);
 }
 

From 61e8c5f798dc868070a1bc175a4f6851bb1840f9 Mon Sep 17 00:00:00 2001
From: Ameer J <52414509+ameerj@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:51:44 -0500
Subject: [PATCH 2/2] gl_rasterizer: Less spammy log for unimplemented resets

---
 src/video_core/renderer_opengl/gl_rasterizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 0545e33c5..8111f77a8 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -413,7 +413,7 @@ void RasterizerOpenGL::DispatchCompute() {
 void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) {
     const auto query_cache_type = MaxwellToVideoCoreQuery(type);
     if (!query_cache_type.has_value()) {
-        UNIMPLEMENTED_MSG("Reset query type: {}", type);
+        UNIMPLEMENTED_IF_MSG(type != VideoCommon::QueryType::Payload, "Reset query type: {}", type);
         return;
     }
     query_cache.ResetCounter(*query_cache_type);