From 4258d515e69d5f2d40301a1d1592304c12fd8126 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sat, 29 Jan 2022 22:00:49 +0100
Subject: [PATCH] Rasterizer: Implement Inline2Memory Acceleration.

---
 src/video_core/buffer_cache/buffer_cache.h    | 46 +++++++++++++++++--
 src/video_core/engines/engine_upload.cpp      |  8 +++-
 src/video_core/engines/engine_upload.h        |  8 ++++
 src/video_core/engines/kepler_compute.cpp     |  1 +
 src/video_core/engines/kepler_memory.cpp      |  4 ++
 src/video_core/engines/kepler_memory.h        |  7 +++
 src/video_core/engines/maxwell_3d.cpp         |  1 +
 src/video_core/engines/maxwell_3d.h           |  3 +-
 src/video_core/gpu.cpp                        |  1 +
 src/video_core/rasterizer_interface.h         |  3 ++
 .../renderer_opengl/gl_rasterizer.cpp         | 22 +++++++++
 .../renderer_opengl/gl_rasterizer.h           |  1 +
 .../renderer_vulkan/vk_rasterizer.cpp         | 22 +++++++++
 .../renderer_vulkan/vk_rasterizer.h           |  1 +
 14 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 048dba4f3..792c3820a 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -131,6 +131,8 @@ public:
 
     void DownloadMemory(VAddr cpu_addr, u64 size);
 
+    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
+
     void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
 
     void DisableGraphicsUniformBuffer(size_t stage, u32 index);
@@ -808,6 +810,8 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
         return;
     }
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
+    const bool is_accuracy_normal =
+        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal;
 
     boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
     u64 total_size_bytes = 0;
@@ -819,6 +823,9 @@ void BufferCache<P>::CommitAsyncFlushesHigh() {
             ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
                 buffer.ForEachDownloadRangeAndClear(
                     cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+                        if (is_accuracy_normal) {
+                            return;
+                        }
                         const VAddr buffer_addr = buffer.CpuAddr();
                         const auto add_download = [&](VAddr start, VAddr end) {
                             const u64 new_offset = start - buffer_addr;
@@ -1417,10 +1424,8 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
     const IntervalType base_interval{cpu_addr, cpu_addr + size};
     common_ranges.add(base_interval);
 
-    const bool is_accuracy_high =
-        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
     const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
-    if (!is_async && !is_accuracy_high) {
+    if (!is_async) {
         return;
     }
     uncommitted_ranges.add(base_interval);
@@ -1643,6 +1648,41 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
     runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
 }
 
+/// Writes inline-uploaded data directly into a cached GPU buffer. Returns false, leaving the
+/// cache untouched, unless the destination range is registered and GPU-modified.
+template <class P>
+bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
+                                  std::span<u8> inlined_buffer) {
+    if (!IsRegionRegistered(dest_address, copy_size) ||
+        !IsRegionGpuModified(dest_address, copy_size)) {
+        return false;
+    }
+
+    // Clear the destination interval from the download tracking before it is overwritten.
+    const IntervalType subtract_interval{dest_address, dest_address + copy_size};
+    ClearDownload(subtract_interval);
+
+    const BufferId buffer_id = FindBuffer(dest_address, static_cast<u32>(copy_size));
+    auto& buffer = slot_buffers[buffer_id];
+    SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size));
+
+    if constexpr (USE_MEMORY_MAPS) {
+        auto upload_staging = runtime.UploadStagingBuffer(copy_size);
+        // The source offset must follow the staging allocation, which need not start at 0.
+        std::array copies{BufferCopy{
+            .src_offset = upload_staging.offset,
+            .dst_offset = buffer.Offset(dest_address),
+            .size = copy_size,
+        }};
+        std::memcpy(upload_staging.mapped_span.data(), inlined_buffer.data(), copy_size);
+        runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
+    } else {
+        buffer.ImmediateUpload(buffer.Offset(dest_address), inlined_buffer.first(copy_size));
+    }
+
+    return true;
+}
+
 template <class P>
 void BufferCache<P>::DownloadBufferMemory(Buffer& buffer) {
     DownloadBufferMemory(buffer, buffer.CpuAddr(), buffer.SizeBytes());
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index 9ac0e533c..4eb9a094d 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -7,6 +7,7 @@
 #include "common/assert.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines::Upload {
@@ -16,6 +17,10 @@ State::State(MemoryManager& memory_manager_, Registers& regs_)
 
 State::~State() = default;
 
+void State::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    rasterizer = rasterizer_; // ProcessData dereferences this for linear uploads.
+}
+
 void State::ProcessExec(const bool is_linear_) {
     write_offset = 0;
     copy_size = regs.line_length_in * regs.line_count;
@@ -32,8 +37,7 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
     }
     const GPUVAddr address{regs.dest.Address()};
     if (is_linear) {
-        memory_manager.FlushRegion(address, copy_size);
-        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
+        rasterizer->AccelerateInline2Memory(address, copy_size, inner_buffer);
     } else {
         UNIMPLEMENTED_IF(regs.dest.z != 0);
         UNIMPLEMENTED_IF(regs.dest.depth != 1);
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
index 1c7f1effa..c9c5ec8c3 100644
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -12,6 +12,10 @@ namespace Tegra {
 class MemoryManager;
 }
 
+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines::Upload {
 
 struct Registers {
@@ -60,6 +64,9 @@ public:
     void ProcessExec(bool is_linear_);
     void ProcessData(u32 data, bool is_last_call);
 
+    /// Binds a rasterizer to this engine.
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
+
 private:
     u32 write_offset = 0;
     u32 copy_size = 0;
@@ -68,6 +75,7 @@ private:
     bool is_linear = false;
     Registers& regs;
     MemoryManager& memory_manager;
+    VideoCore::RasterizerInterface* rasterizer = nullptr;
 };
 
 } // namespace Tegra::Engines::Upload
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 492b4c5a3..5a1c12076 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -22,6 +22,7 @@ KeplerCompute::~KeplerCompute() = default;
 
 void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
     rasterizer = rasterizer_;
+    upload_state.BindRasterizer(rasterizer);
 }
 
 void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 560551157..8aed16caa 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -19,6 +19,10 @@ KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager)
 
 KeplerMemory::~KeplerMemory() = default;
 
+void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
+    upload_state.BindRasterizer(rasterizer_); // Only forwarded to the upload state.
+}
+
 void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
     ASSERT_MSG(method < Regs::NUM_REGS,
                "Invalid KeplerMemory register, increase the size of the Regs structure");
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 0d8ea09a9..949e2fae1 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -22,6 +22,10 @@ namespace Tegra {
 class MemoryManager;
 }
 
+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {
 
 /**
@@ -38,6 +42,9 @@ public:
     explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager);
     ~KeplerMemory() override;
 
+    /// Binds a rasterizer to this engine.
+    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
+
     /// Write the value to the register identified by method.
     void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index c38ebd670..5d6d217bb 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -31,6 +31,7 @@ Maxwell3D::~Maxwell3D() = default;
 
 void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
     rasterizer = rasterizer_;
+    upload_state.BindRasterizer(rasterizer_);
 }
 
 void Maxwell3D::InitializeRegisterDefaults() {
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f22342dfb..e0a435c45 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1557,7 +1557,8 @@ private:
 
     static constexpr u32 null_cb_data = 0xFFFFFFFF;
     struct CBDataState {
-        std::array<std::array<u32, 0x4000>, 16> buffer;
+        static constexpr size_t inline_size = 0x8000;
+        std::array<std::array<u32, inline_size>, 16> buffer;
         u32 current{null_cb_data};
         u32 id{null_cb_data};
         u32 start_pos{};
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 705765c99..e326018ae 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -59,6 +59,7 @@ struct GPU::Impl {
         maxwell_3d->BindRasterizer(rasterizer);
         fermi_2d->BindRasterizer(rasterizer);
         kepler_compute->BindRasterizer(rasterizer);
+        kepler_memory->BindRasterizer(rasterizer);
         maxwell_dma->BindRasterizer(rasterizer);
     }
 
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index b094fc064..78e6e566f 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -123,6 +123,9 @@ public:
 
     [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
 
+    virtual void AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
+                                         std::span<u8> memory) = 0;
+
     /// Attempt to use a faster method to display the framebuffer to screen
     [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
                                                  VAddr framebuffer_addr, u32 pixel_stride) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 9b516c64f..57a6ff752 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -484,6 +484,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
     return accelerate_dma;
 }
 
+void RasterizerOpenGL::AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
+                                               std::span<u8> memory) {
+    const auto cpu_addr = gpu_memory.GpuToCpuAddress(address);
+    if (!cpu_addr) [[unlikely]] { // No CPU address for this GPU address.
+        gpu_memory.WriteBlock(address, memory.data(), copy_size);
+        return;
+    }
+    gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size); // Caches handled below.
+    {
+        std::scoped_lock lock_buffer{buffer_cache.mutex};
+        if (!buffer_cache.InlineMemory(*cpu_addr, copy_size, memory)) {
+            buffer_cache.WriteMemory(*cpu_addr, copy_size);
+        }
+    }
+    {
+        std::scoped_lock lock_texture{texture_cache.mutex};
+        texture_cache.WriteMemory(*cpu_addr, copy_size);
+    }
+    shader_cache.InvalidateRegion(*cpu_addr, copy_size);
+    query_cache.InvalidateRegion(*cpu_addr, copy_size);
+}
+
 bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
                                          VAddr framebuffer_addr, u32 pixel_stride) {
     if (framebuffer_addr == 0) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index d0397b745..dd97c279b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -106,6 +106,7 @@ public:
                                const Tegra::Engines::Fermi2D::Surface& dst,
                                const Tegra::Engines::Fermi2D::Config& copy_config) override;
     Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
+    void AccelerateInline2Memory(GPUVAddr address, size_t copy_size, std::span<u8> memory) override;
     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                            u32 pixel_stride) override;
     void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index fd334a146..65c5fd63f 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -548,6 +548,28 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
     return accelerate_dma;
 }
 
+void RasterizerVulkan::AccelerateInline2Memory(GPUVAddr address, size_t copy_size,
+                                               std::span<u8> memory) {
+    const auto cpu_addr = gpu_memory.GpuToCpuAddress(address);
+    if (!cpu_addr) [[unlikely]] { // No CPU address for this GPU address.
+        gpu_memory.WriteBlock(address, memory.data(), copy_size);
+        return;
+    }
+    gpu_memory.WriteBlockUnsafe(address, memory.data(), copy_size); // Caches handled below.
+    {
+        std::scoped_lock lock_buffer{buffer_cache.mutex};
+        if (!buffer_cache.InlineMemory(*cpu_addr, copy_size, memory)) {
+            buffer_cache.WriteMemory(*cpu_addr, copy_size);
+        }
+    }
+    {
+        std::scoped_lock lock_texture{texture_cache.mutex};
+        texture_cache.WriteMemory(*cpu_addr, copy_size);
+    }
+    pipeline_cache.InvalidateRegion(*cpu_addr, copy_size);
+    query_cache.InvalidateRegion(*cpu_addr, copy_size);
+}
+
 bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
                                          VAddr framebuffer_addr, u32 pixel_stride) {
     if (!framebuffer_addr) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 866827247..bbd79aae4 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -99,6 +99,7 @@ public:
                                const Tegra::Engines::Fermi2D::Surface& dst,
                                const Tegra::Engines::Fermi2D::Config& copy_config) override;
     Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
+    void AccelerateInline2Memory(GPUVAddr address, size_t copy_size, std::span<u8> memory) override;
     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                            u32 pixel_stride) override;
     void LoadDiskResources(u64 title_id, std::stop_token stop_loading,