From 4006929c986a2e0e52429fe21201a7ad5ca3fea9 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Wed, 3 Mar 2021 03:07:19 -0300
Subject: [PATCH] shader: Implement HADD2

---
 src/shader_recompiler/CMakeLists.txt          |   1 +
 .../backend/spirv/emit_spirv.h                |  28 ++-
 .../backend/spirv/emit_spirv_composite.cpp    |  72 +++++--
 .../backend/spirv/emit_spirv_convert.cpp      |  16 ++
 .../frontend/ir/ir_emitter.cpp                |  90 ++++++++-
 .../frontend/ir/ir_emitter.h                  |   4 +-
 src/shader_recompiler/frontend/ir/opcodes.inc |  17 +-
 .../impl/half_floating_point_add.cpp          | 184 ++++++++++++++++++
 .../translate/impl/load_store_memory.cpp      |   2 +-
 .../translate/impl/not_implemented.cpp        |  16 --
 .../global_memory_to_storage_buffer_pass.cpp  |   2 +-
 .../ir_opt/lower_fp16_to_fp32.cpp             |  10 +
 12 files changed, 400 insertions(+), 42 deletions(-)
 create mode 100644 src/shader_recompiler/frontend/maxwell/translate/impl/half_floating_point_add.cpp

diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt
index 6506413a8..cb73e03fb 100644
--- a/src/shader_recompiler/CMakeLists.txt
+++ b/src/shader_recompiler/CMakeLists.txt
@@ -71,6 +71,7 @@ add_library(shader_recompiler STATIC
     frontend/maxwell/translate/impl/floating_point_multi_function.cpp
     frontend/maxwell/translate/impl/floating_point_multiply.cpp
     frontend/maxwell/translate/impl/floating_point_range_reduction.cpp
+    frontend/maxwell/translate/impl/half_floating_point_add.cpp
     frontend/maxwell/translate/impl/impl.cpp
     frontend/maxwell/translate/impl/impl.h
     frontend/maxwell/translate/impl/integer_add.cpp
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.h b/src/shader_recompiler/backend/spirv/emit_spirv.h
index 5446d6ab6..bed43c094 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.h
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.h
@@ -90,24 +90,36 @@ Id EmitCompositeConstructU32x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4);
 Id EmitCompositeExtractU32x2(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractU32x3(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractU32x4(EmitContext& ctx, Id composite, u32 index);
-void EmitCompositeConstructF16x2(EmitContext& ctx);
-void EmitCompositeConstructF16x3(EmitContext& ctx);
-void EmitCompositeConstructF16x4(EmitContext& ctx);
+Id EmitCompositeInsertU32x2(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertU32x3(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertU32x4(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeConstructF16x2(EmitContext& ctx, Id e1, Id e2);
+Id EmitCompositeConstructF16x3(EmitContext& ctx, Id e1, Id e2, Id e3);
+Id EmitCompositeConstructF16x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4);
 Id EmitCompositeExtractF16x2(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractF16x3(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractF16x4(EmitContext& ctx, Id composite, u32 index);
-void EmitCompositeConstructF32x2(EmitContext& ctx);
-void EmitCompositeConstructF32x3(EmitContext& ctx);
-void EmitCompositeConstructF32x4(EmitContext& ctx);
+Id EmitCompositeInsertF16x2(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF16x3(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF16x4(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeConstructF32x2(EmitContext& ctx, Id e1, Id e2);
+Id EmitCompositeConstructF32x3(EmitContext& ctx, Id e1, Id e2, Id e3);
+Id EmitCompositeConstructF32x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4);
 Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractF32x3(EmitContext& ctx, Id composite, u32 index);
 Id EmitCompositeExtractF32x4(EmitContext& ctx, Id composite, u32 index);
+Id EmitCompositeInsertF32x2(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF32x3(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF32x4(EmitContext& ctx, Id composite, Id object, u32 index);
 void EmitCompositeConstructF64x2(EmitContext& ctx);
 void EmitCompositeConstructF64x3(EmitContext& ctx);
 void EmitCompositeConstructF64x4(EmitContext& ctx);
 void EmitCompositeExtractF64x2(EmitContext& ctx);
 void EmitCompositeExtractF64x3(EmitContext& ctx);
 void EmitCompositeExtractF64x4(EmitContext& ctx);
+Id EmitCompositeInsertF64x2(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF64x3(EmitContext& ctx, Id composite, Id object, u32 index);
+Id EmitCompositeInsertF64x4(EmitContext& ctx, Id composite, Id object, u32 index);
 Id EmitSelectU8(EmitContext& ctx, Id cond, Id true_value, Id false_value);
 Id EmitSelectU16(EmitContext& ctx, Id cond, Id true_value, Id false_value);
 Id EmitSelectU32(EmitContext& ctx, Id cond, Id true_value, Id false_value);
@@ -270,5 +282,9 @@ Id EmitConvertU64F32(EmitContext& ctx, Id value);
 Id EmitConvertU64F64(EmitContext& ctx, Id value);
 Id EmitConvertU64U32(EmitContext& ctx, Id value);
 Id EmitConvertU32U64(EmitContext& ctx, Id value);
+Id EmitConvertF16F32(EmitContext& ctx, Id value);
+Id EmitConvertF32F16(EmitContext& ctx, Id value);
+Id EmitConvertF32F64(EmitContext& ctx, Id value);
+Id EmitConvertF64F32(EmitContext& ctx, Id value);
 
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp
index c950854a0..616e63676 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_composite.cpp
@@ -30,16 +30,28 @@ Id EmitCompositeExtractU32x4(EmitContext& ctx, Id composite, u32 index) {
     return ctx.OpCompositeExtract(ctx.U32[1], composite, index);
 }
 
-void EmitCompositeConstructF16x2(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertU32x2(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.U32[2], object, composite, index);
 }
 
-void EmitCompositeConstructF16x3(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertU32x3(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.U32[3], object, composite, index);
 }
 
-void EmitCompositeConstructF16x4(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertU32x4(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.U32[4], object, composite, index);
+}
+
+Id EmitCompositeConstructF16x2(EmitContext& ctx, Id e1, Id e2) {
+    return ctx.OpCompositeConstruct(ctx.F16[2], e1, e2);
+}
+
+Id EmitCompositeConstructF16x3(EmitContext& ctx, Id e1, Id e2, Id e3) {
+    return ctx.OpCompositeConstruct(ctx.F16[3], e1, e2, e3);
+}
+
+Id EmitCompositeConstructF16x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4) {
+    return ctx.OpCompositeConstruct(ctx.F16[4], e1, e2, e3, e4);
 }
 
 Id EmitCompositeExtractF16x2(EmitContext& ctx, Id composite, u32 index) {
@@ -54,16 +66,28 @@ Id EmitCompositeExtractF16x4(EmitContext& ctx, Id composite, u32 index) {
     return ctx.OpCompositeExtract(ctx.F16[1], composite, index);
 }
 
-void EmitCompositeConstructF32x2(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertF16x2(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F16[2], object, composite, index);
 }
 
-void EmitCompositeConstructF32x3(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertF16x3(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F16[3], object, composite, index);
 }
 
-void EmitCompositeConstructF32x4(EmitContext&) {
-    throw NotImplementedException("SPIR-V Instruction");
+Id EmitCompositeInsertF16x4(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F16[4], object, composite, index);
+}
+
+Id EmitCompositeConstructF32x2(EmitContext& ctx, Id e1, Id e2) {
+    return ctx.OpCompositeConstruct(ctx.F32[2], e1, e2);
+}
+
+Id EmitCompositeConstructF32x3(EmitContext& ctx, Id e1, Id e2, Id e3) {
+    return ctx.OpCompositeConstruct(ctx.F32[3], e1, e2, e3);
+}
+
+Id EmitCompositeConstructF32x4(EmitContext& ctx, Id e1, Id e2, Id e3, Id e4) {
+    return ctx.OpCompositeConstruct(ctx.F32[4], e1, e2, e3, e4);
 }
 
 Id EmitCompositeExtractF32x2(EmitContext& ctx, Id composite, u32 index) {
@@ -78,6 +102,18 @@ Id EmitCompositeExtractF32x4(EmitContext& ctx, Id composite, u32 index) {
     return ctx.OpCompositeExtract(ctx.F32[1], composite, index);
 }
 
+Id EmitCompositeInsertF32x2(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F32[2], object, composite, index); // object goes first
+}
+
+Id EmitCompositeInsertF32x3(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F32[3], object, composite, index); // object goes first
+}
+
+Id EmitCompositeInsertF32x4(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F32[4], object, composite, index); // object goes first
+}
+
 void EmitCompositeConstructF64x2(EmitContext&) {
     throw NotImplementedException("SPIR-V Instruction");
 }
@@ -102,4 +138,16 @@ void EmitCompositeExtractF64x4(EmitContext&) {
     throw NotImplementedException("SPIR-V Instruction");
 }
 
+Id EmitCompositeInsertF64x2(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F64[2], object, composite, index); // object goes first
+}
+
+Id EmitCompositeInsertF64x3(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F64[3], object, composite, index); // object goes first
+}
+
+Id EmitCompositeInsertF64x4(EmitContext& ctx, Id composite, Id object, u32 index) {
+    return ctx.OpCompositeInsert(ctx.F64[4], object, composite, index); // object goes first
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
index 76ccaffce..edcc2a1cc 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_convert.cpp
@@ -86,4 +86,20 @@ Id EmitConvertU32U64(EmitContext& ctx, Id value) {
     return ctx.OpUConvert(ctx.U32[1], value);
 }
 
+Id EmitConvertF16F32(EmitContext& ctx, Id value) {
+    return ctx.OpFConvert(ctx.F16[1], value); // F32 operand, F16 result type
+}
+
+Id EmitConvertF32F16(EmitContext& ctx, Id value) {
+    return ctx.OpFConvert(ctx.F32[1], value); // F16 operand, F32 result type
+}
+
+Id EmitConvertF32F64(EmitContext& ctx, Id value) {
+    return ctx.OpFConvert(ctx.F32[1], value); // F64 operand, F32 result type
+}
+
+Id EmitConvertF64F32(EmitContext& ctx, Id value) {
+    return ctx.OpFConvert(ctx.F64[1], value); // F32 operand, F64 result type
+}
+
 } // namespace Shader::Backend::SPIRV
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
index 0f1cab57a..186920d8f 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
@@ -334,12 +334,12 @@ Value IREmitter::CompositeConstruct(const Value& e1, const Value& e2, const Valu
 }
 
 Value IREmitter::CompositeExtract(const Value& vector, size_t element) {
-    const auto read = [&](Opcode opcode, size_t limit) -> Value {
+    const auto read{[&](Opcode opcode, size_t limit) -> Value {
         if (element >= limit) {
             throw InvalidArgument("Out of bounds element {}", element);
         }
         return Inst(opcode, vector, Value{static_cast<u32>(element)});
-    };
+    }};
     switch (vector.Type()) {
     case Type::U32x2:
         return read(Opcode::CompositeExtractU32x2, 2);
@@ -370,6 +370,43 @@ Value IREmitter::CompositeExtract(const Value& vector, size_t element) {
     }
 }
 
+Value IREmitter::CompositeInsert(const Value& vector, const Value& object, size_t element) {
+    const auto insert{[&](Opcode opcode, size_t limit) { // limit: element count of the type
+        if (element >= limit) {
+            throw InvalidArgument("Out of bounds element {}", element);
+        }
+        return Inst(opcode, vector, object, Value{static_cast<u32>(element)});
+    }};
+    switch (vector.Type()) { // Pick the insert opcode that matches the vector's type
+    case Type::U32x2:
+        return insert(Opcode::CompositeInsertU32x2, 2);
+    case Type::U32x3:
+        return insert(Opcode::CompositeInsertU32x3, 3);
+    case Type::U32x4:
+        return insert(Opcode::CompositeInsertU32x4, 4);
+    case Type::F16x2:
+        return insert(Opcode::CompositeInsertF16x2, 2);
+    case Type::F16x3:
+        return insert(Opcode::CompositeInsertF16x3, 3);
+    case Type::F16x4:
+        return insert(Opcode::CompositeInsertF16x4, 4);
+    case Type::F32x2:
+        return insert(Opcode::CompositeInsertF32x2, 2);
+    case Type::F32x3:
+        return insert(Opcode::CompositeInsertF32x3, 3);
+    case Type::F32x4:
+        return insert(Opcode::CompositeInsertF32x4, 4);
+    case Type::F64x2:
+        return insert(Opcode::CompositeInsertF64x2, 2);
+    case Type::F64x3:
+        return insert(Opcode::CompositeInsertF64x3, 3);
+    case Type::F64x4:
+        return insert(Opcode::CompositeInsertF64x4, 4);
+    default:
+        ThrowInvalidType(vector.Type()); // Not one of the vector types handled above
+    }
+}
+
 Value IREmitter::Select(const U1& condition, const Value& true_value, const Value& false_value) {
     if (true_value.Type() != false_value.Type()) {
         throw InvalidArgument("Mismatching types {} and {}", true_value.Type(), false_value.Type());
@@ -433,7 +470,7 @@ U32 IREmitter::PackFloat2x16(const Value& vector) {
 }
 
 Value IREmitter::UnpackFloat2x16(const U32& value) {
-    return Inst<Value>(Opcode::UnpackFloat2x16, value);
+    return Inst(Opcode::UnpackFloat2x16, value);
 }
 
 F64 IREmitter::PackDouble2x32(const Value& vector) {
@@ -968,7 +1005,7 @@ U32U64 IREmitter::ConvertFToI(size_t bitsize, bool is_signed, const F16F32F64& v
     }
 }
 
-U32U64 IREmitter::ConvertU(size_t result_bitsize, const U32U64& value) {
+U32U64 IREmitter::UConvert(size_t result_bitsize, const U32U64& value) {
     switch (result_bitsize) {
     case 32:
         switch (value.Type()) {
@@ -995,4 +1032,49 @@ U32U64 IREmitter::ConvertU(size_t result_bitsize, const U32U64& value) {
     throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize);
 }
 
+F16F32F64 IREmitter::FPConvert(size_t result_bitsize, const F16F32F64& value) {
+    switch (result_bitsize) { // Requested result width in bits; inner switches: source type
+    case 16:
+        switch (value.Type()) {
+        case Type::F16:
+            // Nothing to do
+            return value;
+        case Type::F32:
+            return Inst<F16>(Opcode::ConvertF16F32, value);
+        case Type::F64:
+            throw LogicError("Illegal conversion from F64 to F16");
+        default:
+            break;
+        }
+        break;
+    case 32:
+        switch (value.Type()) {
+        case Type::F16:
+            return Inst<F32>(Opcode::ConvertF32F16, value);
+        case Type::F32:
+            // Nothing to do
+            return value;
+        case Type::F64:
+            return Inst<F32>(Opcode::ConvertF32F64, value); // Narrowing yields an F32
+        default:
+            break;
+        }
+        break;
+    case 64:
+        switch (value.Type()) {
+        case Type::F16:
+            throw LogicError("Illegal conversion from F16 to F64");
+        case Type::F32:
+            return Inst<F64>(Opcode::ConvertF64F32, value); // Widen F32 to F64
+        case Type::F64:
+            // Nothing to do
+            return value;
+        default:
+            break;
+        }
+        break;
+    }
+    throw NotImplementedException("Conversion from {} to {} bits", value.Type(), result_bitsize);
+}
+
 } // namespace Shader::IR
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h
index 03a67985f..5beb99895 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.h
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.h
@@ -97,6 +97,7 @@ public:
     [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3,
                                            const Value& e4);
     [[nodiscard]] Value CompositeExtract(const Value& vector, size_t element);
+    [[nodiscard]] Value CompositeInsert(const Value& vector, const Value& object, size_t element);
 
     [[nodiscard]] Value Select(const U1& condition, const Value& true_value,
                                const Value& false_value);
@@ -186,7 +187,8 @@ public:
     [[nodiscard]] U32U64 ConvertFToU(size_t bitsize, const F16F32F64& value);
     [[nodiscard]] U32U64 ConvertFToI(size_t bitsize, bool is_signed, const F16F32F64& value);
 
-    [[nodiscard]] U32U64 ConvertU(size_t result_bitsize, const U32U64& value);
+    [[nodiscard]] U32U64 UConvert(size_t result_bitsize, const U32U64& value);
+    [[nodiscard]] F16F32F64 FPConvert(size_t result_bitsize, const F16F32F64& value);
 
 private:
     IR::Block::iterator insertion_point;
diff --git a/src/shader_recompiler/frontend/ir/opcodes.inc b/src/shader_recompiler/frontend/ir/opcodes.inc
index aedbc5c3e..acfc0a829 100644
--- a/src/shader_recompiler/frontend/ir/opcodes.inc
+++ b/src/shader_recompiler/frontend/ir/opcodes.inc
@@ -83,24 +83,36 @@ OPCODE(CompositeConstructU32x4,                             U32x4,          U32,
 OPCODE(CompositeExtractU32x2,                               U32,            U32x2,          U32,                                            )
 OPCODE(CompositeExtractU32x3,                               U32,            U32x3,          U32,                                            )
 OPCODE(CompositeExtractU32x4,                               U32,            U32x4,          U32,                                            )
+OPCODE(CompositeInsertU32x2,                                U32x2,          U32x2,          U32,            U32,                            )
+OPCODE(CompositeInsertU32x3,                                U32x3,          U32x3,          U32,            U32,                            )
+OPCODE(CompositeInsertU32x4,                                U32x4,          U32x4,          U32,            U32,                            )
 OPCODE(CompositeConstructF16x2,                             F16x2,          F16,            F16,                                            )
 OPCODE(CompositeConstructF16x3,                             F16x3,          F16,            F16,            F16,                            )
 OPCODE(CompositeConstructF16x4,                             F16x4,          F16,            F16,            F16,            F16,            )
 OPCODE(CompositeExtractF16x2,                               F16,            F16x2,          U32,                                            )
 OPCODE(CompositeExtractF16x3,                               F16,            F16x3,          U32,                                            )
 OPCODE(CompositeExtractF16x4,                               F16,            F16x4,          U32,                                            )
+OPCODE(CompositeInsertF16x2,                                F16x2,          F16x2,          F16,            U32,                            )
+OPCODE(CompositeInsertF16x3,                                F16x3,          F16x3,          F16,            U32,                            )
+OPCODE(CompositeInsertF16x4,                                F16x4,          F16x4,          F16,            U32,                            )
 OPCODE(CompositeConstructF32x2,                             F32x2,          F32,            F32,                                            )
 OPCODE(CompositeConstructF32x3,                             F32x3,          F32,            F32,            F32,                            )
 OPCODE(CompositeConstructF32x4,                             F32x4,          F32,            F32,            F32,            F32,            )
 OPCODE(CompositeExtractF32x2,                               F32,            F32x2,          U32,                                            )
 OPCODE(CompositeExtractF32x3,                               F32,            F32x3,          U32,                                            )
 OPCODE(CompositeExtractF32x4,                               F32,            F32x4,          U32,                                            )
+OPCODE(CompositeInsertF32x2,                                F32x2,          F32x2,          F32,            U32,                            )
+OPCODE(CompositeInsertF32x3,                                F32x3,          F32x3,          F32,            U32,                            )
+OPCODE(CompositeInsertF32x4,                                F32x4,          F32x4,          F32,            U32,                            )
 OPCODE(CompositeConstructF64x2,                             F64x2,          F64,            F64,                                            )
 OPCODE(CompositeConstructF64x3,                             F64x3,          F64,            F64,            F64,                            )
 OPCODE(CompositeConstructF64x4,                             F64x4,          F64,            F64,            F64,            F64,            )
 OPCODE(CompositeExtractF64x2,                               F64,            F64x2,          U32,                                            )
 OPCODE(CompositeExtractF64x3,                               F64,            F64x3,          U32,                                            )
 OPCODE(CompositeExtractF64x4,                               F64,            F64x4,          U32,                                            )
+OPCODE(CompositeInsertF64x2,                                F64x2,          F64x2,          F64,            U32,                            )
+OPCODE(CompositeInsertF64x3,                                F64x3,          F64x3,          F64,            U32,                            )
+OPCODE(CompositeInsertF64x4,                                F64x4,          F64x4,          F64,            U32,                            )
 
 // Select operations
 OPCODE(SelectU8,                                            U8,             U1,             U8,             U8,                             )
@@ -277,6 +289,9 @@ OPCODE(ConvertU32F64,                                       U32,            F64,
 OPCODE(ConvertU64F16,                                       U64,            F16,                                                            )
 OPCODE(ConvertU64F32,                                       U64,            F32,                                                            )
 OPCODE(ConvertU64F64,                                       U64,            F64,                                                            )
-
 OPCODE(ConvertU64U32,                                       U64,            U32,                                                            )
 OPCODE(ConvertU32U64,                                       U32,            U64,                                                            )
+OPCODE(ConvertF16F32,                                       F16,            F32,                                                            )
+OPCODE(ConvertF32F16,                                       F32,            F16,                                                            )
+OPCODE(ConvertF32F64,                                       F32,            F64,                                                            )
+OPCODE(ConvertF64F32,                                       F64,            F32,                                                            )
diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/half_floating_point_add.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/half_floating_point_add.cpp
new file mode 100644
index 000000000..6965adfb3
--- /dev/null
+++ b/src/shader_recompiler/frontend/maxwell/translate/impl/half_floating_point_add.cpp
@@ -0,0 +1,184 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/common_types.h"
+#include "shader_recompiler/exception.h"
+#include "shader_recompiler/frontend/maxwell/translate/impl/common_encoding.h"
+#include "shader_recompiler/frontend/maxwell/translate/impl/impl.h"
+
+namespace Shader::Maxwell {
+namespace {
+enum class Merge : u64 {
+    H1_H0,  // MergeResult packs both computed values into the result
+    F32,    // MergeResult converts the first value to 32 bits and bit-casts it
+    MRG_H0, // MergeResult replaces only element 0 of the destination's value
+    MRG_H1, // MergeResult replaces only element 1 of the destination's value
+};
+
+enum class Swizzle : u64 {
+    H1_H0, // Extract yields element 0 and element 1 of the unpacked pair
+    F32,   // Extract bit-casts the whole value to F32 and duplicates it
+    H0_H0, // Extract duplicates element 0
+    H1_H1, // Extract duplicates element 1
+};
+
+std::pair<IR::F16F32F64, IR::F16F32F64> Extract(IR::IREmitter& ir, IR::U32 value, Swizzle swizzle) {
+    switch (swizzle) {
+    case Swizzle::H1_H0: { // Both elements: element 0 first, element 1 second
+        const IR::Value vector{ir.UnpackFloat2x16(value)};
+        return {IR::F16{ir.CompositeExtract(vector, 0)}, IR::F16{ir.CompositeExtract(vector, 1)}};
+    }
+    case Swizzle::H0_H0: { // Element 0 duplicated into both results
+        const IR::F16 scalar{ir.CompositeExtract(ir.UnpackFloat2x16(value), 0)};
+        return {scalar, scalar};
+    }
+    case Swizzle::H1_H1: { // Element 1 duplicated into both results
+        const IR::F16 scalar{ir.CompositeExtract(ir.UnpackFloat2x16(value), 1)};
+        return {scalar, scalar};
+    }
+    case Swizzle::F32: { // Whole value reinterpreted as one F32, duplicated
+        const IR::F32 scalar{ir.BitCast<IR::F32>(value)};
+        return {scalar, scalar};
+    }
+    }
+    throw InvalidArgument("Invalid swizzle {}", swizzle); // Reached when no case matched
+}
+
+IR::U32 MergeResult(IR::IREmitter& ir, IR::Reg dest, const IR::F16& lhs, const IR::F16& rhs,
+                    Merge merge) {
+    switch (merge) {
+    case Merge::H1_H0: // Pack lhs and rhs together as a two-element vector
+        return ir.PackFloat2x16(ir.CompositeConstruct(lhs, rhs));
+    case Merge::F32: // Only lhs is used: converted to 32 bits, then bit-cast to U32
+        return ir.BitCast<IR::U32, IR::F32>(ir.FPConvert(32, lhs));
+    case Merge::MRG_H0:
+    case Merge::MRG_H1: { // Read dest's current value and overwrite a single element
+        const IR::Value vector{ir.UnpackFloat2x16(ir.GetReg(dest))};
+        const bool h0{merge == Merge::MRG_H0};
+        const IR::F16& insert{h0 ? lhs : rhs}; // lhs for MRG_H0, rhs for MRG_H1
+        return ir.PackFloat2x16(ir.CompositeInsert(vector, insert, h0 ? 0 : 1));
+    }
+    }
+    throw InvalidArgument("Invalid merge {}", merge); // Reached when no case matched
+}
+
+void HADD2(TranslatorVisitor& v, u64 insn, Merge merge, bool ftz, bool sat, bool abs_a, bool neg_a,
+           Swizzle swizzle_a, bool abs_b, bool neg_b, Swizzle swizzle_b, const IR::U32& src_b) {
+    union {
+        u64 raw;
+        BitField<0, 8, IR::Reg> dest_reg;
+        BitField<8, 8, IR::Reg> src_a;
+    } const hadd2{insn};
+
+    auto [lhs_a, rhs_a]{Extract(v.ir, v.X(hadd2.src_a), swizzle_a)};
+    auto [lhs_b, rhs_b]{Extract(v.ir, src_b, swizzle_b)};
+    const bool promotion{lhs_a.Type() != lhs_b.Type()}; // True when the operand types differ
+    if (promotion) { // Convert whichever operand is F16 to 32 bits so both sides match
+        if (lhs_a.Type() == IR::Type::F16) {
+            lhs_a = v.ir.FPConvert(32, lhs_a);
+            rhs_a = v.ir.FPConvert(32, rhs_a);
+        }
+        if (lhs_b.Type() == IR::Type::F16) {
+            lhs_b = v.ir.FPConvert(32, lhs_b);
+            rhs_b = v.ir.FPConvert(32, rhs_b);
+        }
+    }
+    lhs_a = v.ir.FPAbsNeg(lhs_a, abs_a, neg_a);
+    rhs_a = v.ir.FPAbsNeg(rhs_a, abs_a, neg_a);
+
+    lhs_b = v.ir.FPAbsNeg(lhs_b, abs_b, neg_b);
+    rhs_b = v.ir.FPAbsNeg(rhs_b, abs_b, neg_b);
+
+    const IR::FpControl fp_control{
+        .no_contraction{true},
+        .rounding{IR::FpRounding::DontCare},
+        .fmz_mode{ftz ? IR::FmzMode::FTZ : IR::FmzMode::None},
+    };
+    IR::F16F32F64 lhs{v.ir.FPAdd(lhs_a, lhs_b, fp_control)}; // Sum of the first elements
+    IR::F16F32F64 rhs{v.ir.FPAdd(rhs_a, rhs_b, fp_control)}; // Sum of the second elements
+    if (sat) {
+        lhs = v.ir.FPSaturate(lhs);
+        rhs = v.ir.FPSaturate(rhs);
+    }
+    if (promotion) { // NOTE(review): F32+F32 operands skip this and stay F32 - confirm
+        lhs = v.ir.FPConvert(16, lhs);
+        rhs = v.ir.FPConvert(16, rhs);
+    }
+    v.X(hadd2.dest_reg, MergeResult(v.ir, hadd2.dest_reg, lhs, rhs, merge));
+}
+} // Anonymous namespace
+
+void TranslatorVisitor::HADD2_reg(u64 insn) {
+    union {
+        u64 raw;
+        BitField<49, 2, Merge> merge;
+        BitField<39, 1, u64> ftz;
+        BitField<32, 1, u64> sat;
+        BitField<43, 1, u64> neg_a;
+        BitField<44, 1, u64> abs_a;
+        BitField<47, 2, Swizzle> swizzle_a;
+        BitField<31, 1, u64> neg_b;
+        BitField<30, 1, u64> abs_b;
+        BitField<28, 2, Swizzle> swizzle_b;
+    } const hadd2{insn}; // Fields are read from the raw instruction word
+
+    HADD2(*this, insn, hadd2.merge, hadd2.ftz != 0, hadd2.sat != 0, hadd2.abs_a != 0,
+          hadd2.neg_a != 0, hadd2.swizzle_a, hadd2.abs_b != 0, hadd2.neg_b != 0, hadd2.swizzle_b,
+          GetReg20(insn)); // Operand B: GetReg20(insn) with its own abs/neg/swizzle fields
+}
+
+void TranslatorVisitor::HADD2_cbuf(u64 insn) {
+    union {
+        u64 raw;
+        BitField<49, 2, Merge> merge;
+        BitField<39, 1, u64> ftz;
+        BitField<52, 1, u64> sat;
+        BitField<43, 1, u64> neg_a;
+        BitField<44, 1, u64> abs_a;
+        BitField<47, 2, Swizzle> swizzle_a;
+        BitField<56, 1, u64> neg_b;
+        BitField<54, 1, u64> abs_b;
+    } const hadd2{insn}; // Fields are read from the raw instruction word
+
+    HADD2(*this, insn, hadd2.merge, hadd2.ftz != 0, hadd2.sat != 0, hadd2.abs_a != 0,
+          hadd2.neg_a != 0, hadd2.swizzle_a, hadd2.abs_b != 0, hadd2.neg_b != 0, Swizzle::F32,
+          GetCbuf(insn)); // Operand B: GetCbuf(insn), always passed with Swizzle::F32
+}
+
+void TranslatorVisitor::HADD2_imm(u64 insn) {
+    union {
+        u64 raw;
+        BitField<49, 2, Merge> merge;
+        BitField<39, 1, u64> ftz;
+        BitField<52, 1, u64> sat;
+        BitField<43, 1, u64> neg_a;
+        BitField<44, 1, u64> abs_a;
+        BitField<47, 2, Swizzle> swizzle_a;
+        BitField<56, 1, u64> neg_high; // Placed at bit 31 of imm
+        BitField<30, 9, u64> high;     // Shifted left by 22 into imm
+        BitField<29, 1, u64> neg_low;  // Placed at bit 15 of imm
+        BitField<20, 9, u64> low;      // Shifted left by 6 into imm
+    } const hadd2{insn};
+
+    const u32 imm{static_cast<u32>(hadd2.low << 6) | ((hadd2.neg_low != 0 ? 1 : 0) << 15) |
+                  static_cast<u32>(hadd2.high << 22) | ((hadd2.neg_high != 0 ? 1 : 0) << 31)};
+    HADD2(*this, insn, hadd2.merge, hadd2.ftz != 0, hadd2.sat != 0, hadd2.abs_a != 0,
+          hadd2.neg_a != 0, hadd2.swizzle_a, false, false, Swizzle::H1_H0, ir.Imm32(imm));
+}
+
+void TranslatorVisitor::HADD2_32I(u64 insn) {
+    union {
+        u64 raw;
+        BitField<55, 1, u64> ftz;
+        BitField<52, 1, u64> sat;
+        BitField<56, 1, u64> neg_a;
+        BitField<53, 2, Swizzle> swizzle_a;
+        BitField<20, 32, u64> imm32;
+    } const hadd2{insn}; // No merge, abs_a, abs_b or neg_b fields; fixed values are passed
+
+    const u32 imm{static_cast<u32>(hadd2.imm32)}; // Used as-is for operand B
+    HADD2(*this, insn, Merge::H1_H0, hadd2.ftz != 0, hadd2.sat != 0, false, hadd2.neg_a != 0,
+          hadd2.swizzle_a, false, false, Swizzle::H1_H0, ir.Imm32(imm));
+}
+} // namespace Shader::Maxwell
diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_memory.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_memory.cpp
index 727524284..748b856c9 100644
--- a/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_memory.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate/impl/load_store_memory.cpp
@@ -59,7 +59,7 @@ IR::U64 Address(TranslatorVisitor& v, u64 insn) {
     const IR::U64 address{[&]() -> IR::U64 {
         if (mem.e == 0) {
             // LDG/STG without .E uses a 32-bit pointer, zero-extend it
-            return v.ir.ConvertU(64, v.X(mem.addr_reg));
+            return v.ir.UConvert(64, v.X(mem.addr_reg));
         }
         if (!IR::IsAligned(mem.addr_reg, 2)) {
             throw NotImplementedException("Unaligned address register");
diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/not_implemented.cpp b/src/shader_recompiler/frontend/maxwell/translate/impl/not_implemented.cpp
index a0535f1c2..c24f29ff7 100644
--- a/src/shader_recompiler/frontend/maxwell/translate/impl/not_implemented.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate/impl/not_implemented.cpp
@@ -265,22 +265,6 @@ void TranslatorVisitor::GETLMEMBASE(u64) {
     ThrowNotImplemented(Opcode::GETLMEMBASE);
 }
 
-void TranslatorVisitor::HADD2_reg(u64) {
-    ThrowNotImplemented(Opcode::HADD2_reg);
-}
-
-void TranslatorVisitor::HADD2_cbuf(u64) {
-    ThrowNotImplemented(Opcode::HADD2_cbuf);
-}
-
-void TranslatorVisitor::HADD2_imm(u64) {
-    ThrowNotImplemented(Opcode::HADD2_imm);
-}
-
-void TranslatorVisitor::HADD2_32I(u64) {
-    ThrowNotImplemented(Opcode::HADD2_32I);
-}
-
 void TranslatorVisitor::HFMA2_reg(u64) {
     ThrowNotImplemented(Opcode::HFMA2_reg);
 }
diff --git a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
index 98e3dfef7..965e52135 100644
--- a/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
+++ b/src/shader_recompiler/ir_opt/global_memory_to_storage_buffer_pass.cpp
@@ -298,7 +298,7 @@ IR::U32 StorageOffset(IR::Block& block, IR::Inst& inst, StorageBufferAddr buffer
             offset = ir.IAdd(offset, ir.Imm32(low_addr->imm_offset));
         }
     } else {
-        offset = ir.ConvertU(32, IR::U64{inst.Arg(0)});
+        offset = ir.UConvert(32, IR::U64{inst.Arg(0)});
     }
     // Subtract the least significant 32 bits from the guest offset. The result is the storage
     // buffer offset in bytes.
diff --git a/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp
index c7032f168..14a5cb50f 100644
--- a/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp
+++ b/src/shader_recompiler/ir_opt/lower_fp16_to_fp32.cpp
@@ -44,6 +44,12 @@ IR::Opcode Replace(IR::Opcode op) {
         return IR::Opcode::CompositeExtractF32x3;
     case IR::Opcode::CompositeExtractF16x4:
         return IR::Opcode::CompositeExtractF32x4;
+    case IR::Opcode::CompositeInsertF16x2:
+        return IR::Opcode::CompositeInsertF32x2;
+    case IR::Opcode::CompositeInsertF16x3:
+        return IR::Opcode::CompositeInsertF32x3;
+    case IR::Opcode::CompositeInsertF16x4:
+        return IR::Opcode::CompositeInsertF32x4;
     case IR::Opcode::ConvertS16F16:
         return IR::Opcode::ConvertS16F32;
     case IR::Opcode::ConvertS32F16:
@@ -60,6 +66,10 @@ IR::Opcode Replace(IR::Opcode op) {
         return IR::Opcode::PackHalf2x16;
     case IR::Opcode::UnpackFloat2x16:
         return IR::Opcode::UnpackHalf2x16;
+    case IR::Opcode::ConvertF32F16:
+        return IR::Opcode::Identity;
+    case IR::Opcode::ConvertF16F32:
+        return IR::Opcode::Identity;
     default:
         return op;
     }