[libcxx-commits] [libcxx] [llvm] [clang-tools-extra] [libc] [clang] [flang] [compiler-rt] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
Mariusz Sikora via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Jan 18 02:55:37 PST 2024
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/75917
>From f0920d06a57b3bc77b50baf94c4616be597e74c3 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Mon, 18 Dec 2023 20:08:18 +0100
Subject: [PATCH 1/6] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions
- image_atomic_pk_add_f16
- image_atomic_pk_add_bf16
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16
- buffer_atomic_pk_add_bf16
---
clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 +-
.../builtins-fp-atomics-gfx12.cl | 92 ++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 45 ++
llvm/lib/Target/AMDGPU/AMDGPU.td | 4 +
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 26 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +
.../Target/AMDGPU/AMDGPUSearchableTables.td | 4 +
llvm/lib/Target/AMDGPU/BUFInstructions.td | 21 +
llvm/lib/Target/AMDGPU/DSInstructions.td | 12 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 +
llvm/lib/Target/AMDGPU/MIMGInstructions.td | 2 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 +
llvm/lib/TargetParser/TargetParser.cpp | 4 +
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 433 ++++++++++++++++++
.../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 60 +++
llvm/test/MC/AMDGPU/gfx11_unsupported.s | 18 +
llvm/test/MC/AMDGPU/gfx12_asm_ds.s | 75 +++
llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 132 ++++++
llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 60 +++
llvm/test/MC/AMDGPU/gfx12_asm_vimage.s | 54 +++
.../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 60 +++
.../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 84 ++++
.../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 60 +++
.../Disassembler/AMDGPU/gfx12_dasm_vimage.txt | 54 +++
29 files changed, 1329 insertions(+), 21 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8959634572b44e9..fe1798406967e81 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -100,8 +100,8 @@
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
new file mode 100644
index 000000000000000..20ff12c33763707
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: %s -S -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s
+
+// REQUIRES: amdgpu-registered-target
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+typedef short __attribute__((ext_vector_type(2))) short2;
+
+// CHECK-LABEL: test_local_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// GFX12-LABEL: test_local_add_2bf16
+// GFX12: ds_pk_add_rtn_bf16
+short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2bf16_noret
+// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// GFX12-LABEL: test_local_add_2bf16_noret
+// GFX12: ds_pk_add_bf16
+void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
+ __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2f16
+// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// GFX12-LABEL: test_local_add_2f16
+// GFX12: ds_pk_add_rtn_f16
+half2 test_local_add_2f16(__local half2 *addr, half2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2f16_noret
+// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// GFX12-LABEL: test_local_add_2f16_noret
+// GFX12: ds_pk_add_f16
+void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
+ __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_2f16
+// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_flat_add_2f16
+// GFX12: flat_atomic_pk_add_f16
+half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_flat_add_2bf16
+// GFX12: flat_atomic_pk_add_bf16
+short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_global_add_half2
+// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
+void test_global_add_half2(__global half2 *addr, half2 x) {
+ half2 *rtn;
+ *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2_noret
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_global_add_half2_noret
+// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off
+void test_global_add_half2_noret(__global half2 *addr, half2 x) {
+ __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_global_add_2bf16
+// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
+void test_global_add_2bf16(__global short2 *addr, short2 x) {
+ short2 *rtn;
+ *rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_2bf16_noret
+// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_global_add_2bf16_noret
+// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
+void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {
+ __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51bd9b63c127ed2..bea39743525b239 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1026,6 +1026,9 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
defm int_amdgcn_image_atomic_cmpswap :
AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
+
+ defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
+ defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimAtomic<"ATOMIC_PK_ADD_BF16">;
}
//////////////////////////////////////////////////////////////////////////
@@ -1294,6 +1297,26 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
+// gfx12+ intrinsic
+def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ llvm_v4i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ AMDGPUBufferRsrcTy,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
@@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
+// gfx12 intrinsic
+def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ llvm_v4i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ AMDGPUBufferRsrcTy,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7bc3..14b2155938eef65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1473,6 +1473,10 @@ def FeatureISAVersion12 : FeatureSet<
FeatureArchitectedFlatScratch,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicDsPkAdd16Insts,
+ FeatureAtomicFlatPkAdd16Insts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeatureAtomicGlobalPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40be0..f426b7f38428dad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -261,6 +261,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 156a264a7c1faa0..8205bdac4e2c5d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5425,6 +5425,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b551994..8d972c46447b8b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -587,6 +587,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_FADD_BF16,
BUFFER_ATOMIC_FMIN,
BUFFER_ATOMIC_FMAX,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fbee2888945185c..32964e5c2e3ef1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5872,6 +5872,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -6079,6 +6082,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
LLT Ty = MRI->getType(VData);
+ const bool IsAtomicPacked16Bit =
+ (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+
// Check for 16 bit addresses and pack if true.
LLT GradTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
@@ -6087,7 +6094,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const bool IsG16 =
ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
const bool IsA16 = AddrTy == S16;
- const bool IsD16 = Ty.getScalarType() == S16;
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6129,7 +6136,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT Ty = MRI->getType(VData0);
// TODO: Allow atomic swap and bit ops for v2s16/v4s16
- if (Ty.isVector())
+ if (Ty.isVector() && !IsAtomicPacked16Bit)
return false;
if (BaseOpcode->AtomicX2) {
@@ -6265,9 +6272,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (NumElts > 4 || DMaskLanes > 4)
return false;
+ // Image atomic instructions are using DMask to specify how many bits
+ // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
+ // DMaskLanes for image atomic has default value '0'.
+ // We must be sure that atomic variants (especially packed) will not be
+ // truncated from v2s16 or v4s16 to s16 type.
+ //
+ // ChangeElementCount will be needed for image load where Ty is always scalar.
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
const LLT AdjustedTy =
- Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
+ DMaskLanes == 0
+ ? Ty
+ : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
@@ -7069,6 +7085,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::trap:
return legalizeTrapIntrinsic(MI, MRI, B);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9412f720c62ecf..d8d19f65190bb43 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2920,6 +2920,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
@@ -4200,6 +4201,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index beb670669581f1b..f4415aaa6b1ff92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -278,6 +278,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -294,6 +295,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -310,6 +312,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -326,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291ca0c..2a6ad3b0d4a25be 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1245,6 +1245,13 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
+let SubtargetPredicate = isGFX12Plus in {
+let FPAtomic = 1 in
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_pk_add_bf16", VGPR_32, v2i16
+>;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
@@ -1708,6 +1715,10 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+let SubtargetPredicate = isGFX12Plus in {
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2i16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+}
+
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
@@ -1772,14 +1783,22 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
+ let SubtargetPredicate = isGFX9Only in
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>;
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
let OtherPredicates = [HasAtomicFaddRtnInsts] in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
+ let SubtargetPredicate = isGFX9Only in
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>;
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
let OtherPredicates = [isGFX90APlus] in {
@@ -2634,6 +2653,8 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033,
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">;
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3a895923fa4b987..3f208aaa453df6c 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -487,10 +487,10 @@ let SubtargetPredicate = isGFX90APlus in {
} // End SubtargetPredicate = isGFX90APlus
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
- defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
- defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
- defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
- defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
+ defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">;
+ defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
+ defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">;
+ defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
@@ -1236,6 +1236,10 @@ defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min
defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">;
defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>;
+defm DS_PK_ADD_F16 : DS_Real_gfx12<0x09a>;
+defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>;
+defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
+defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
//===----------------------------------------------------------------------===//
// GFX11.
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0dd2b3f5c2c9127..d959282f6a5a972 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -2597,6 +2597,8 @@ defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMI
defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">;
defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">;
defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">;
+defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059, "FLAT_ATOMIC_PK_ADD_F16", "flat_atomic_pk_add_f16">;
+defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a, "FLAT_ATOMIC_PK_ADD_BF16", "flat_atomic_pk_add_bf16">;
// ENC_VGLOBAL.
defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
@@ -2654,6 +2656,8 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059, "GLOBAL_ATOMIC_PK_ADD_F16", "global_atomic_pk_add_f16">;
+defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a, "GLOBAL_ATOMIC_PK_ADD_BF16", "global_atomic_pk_add_bf16">;
// ENC_VSCRATCH.
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 240366c8e7daaee..28ba445b908d9e2 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1553,6 +1553,8 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c>
defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+defm IMAGE_ATOMIC_PK_ADD_F16 : MIMG_Atomic <mimgopc<0x86, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_f16", 0, 1>;
+defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic <mimgopc<0x87, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_bf16", 0, 1>;
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
let OtherPredicates = [HasExtendedImageInsts] in {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b43e1..b2b6940abe709ad 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7035,17 +7035,17 @@ static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate
-static SDValue constructRetValue(SelectionDAG &DAG,
- MachineSDNode *Result,
- ArrayRef<EVT> ResultTypes,
- bool IsTexFail, bool Unpacked, bool IsD16,
- int DMaskPop, int NumVDataDwords,
+static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
+ ArrayRef<EVT> ResultTypes, bool IsTexFail,
+ bool Unpacked, bool IsD16, int DMaskPop,
+ int NumVDataDwords, bool IsAtomicPacked16Bit,
const SDLoc &DL) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
- ReqRetNumElts : (ReqRetNumElts + 1) / 2;
+ int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
+ ? (ReqRetNumElts + 1) / 2
+ : ReqRetNumElts;
int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
DMaskPop : (DMaskPop + 1) / 2;
@@ -7070,7 +7070,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
}
}
- if (DataDwordVT.isVector())
+ if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
NumDataDwords - MaskPopDwords);
@@ -7177,6 +7177,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue VData;
int NumVDataDwords;
bool AdjustRetType = false;
+ bool IsAtomicPacked16Bit = false;
// Offset of intrinsic arguments
const unsigned ArgOffset = WithChain ? 2 : 1;
@@ -7187,6 +7188,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
+ IsAtomicPacked16Bit =
+ (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+
bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
SDValue VData2 = Op.getOperand(3);
@@ -7516,10 +7521,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
if (BaseOpcode->Store)
return SDValue(NewNode, 0);
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL);
+ return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
+ NumVDataDwords, IsAtomicPacked16Bit, DL);
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
@@ -8383,9 +8387,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
@@ -15277,6 +15287,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
+ case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
// Target-specific read-modify-write atomics are sources of divergence.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 173c877b8d29efd..2e6c147805e066b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -195,6 +195,7 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">;
defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9bc623abcd04b1..6dddb9217ca3a55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3707,6 +3707,7 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index d741d2ce7942dfd..824e0979e22ffbf 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -281,6 +281,10 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["dot9-insts"] = true;
Features["dot10-insts"] = true;
Features["dl-insts"] = true;
+ Features["atomic-ds-pk-add-16-insts"] = true;
+ Features["atomic-flat-pk-add-16-insts"] = true;
+ Features["atomic-buffer-global-pk-add-f16-insts"] = true;
+ Features["atomic-global-pk-add-bf16-inst"] = true;
Features["16-bit-insts"] = true;
Features["dpp"] = true;
Features["gfx8-insts"] = true;
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
new file mode 100644
index 000000000000000..0b06013a7324e04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -0,0 +1,433 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
+
+declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16>, <4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
+declare <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32, i32, i32)
+declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
+declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
+declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
+declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
+declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
+
+define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
+; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ds_pk_add_f16 v0, v1
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
+ ret void
+}
+
+define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
+; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: ds_pk_add_bf16 v0, v1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: buffer_gl0_inv
+; GFX12-GISEL-NEXT: buffer_gl1_inv
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
+ ret void
+}
+
+define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
+; GFX12-LABEL: local_atomic_fadd_v2f16_rtn:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
+ ret <2 x half> %ret
+}
+
+define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
+; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: buffer_gl0_inv
+; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: buffer_gl0_inv
+; GFX12-GISEL-NEXT: buffer_gl1_inv
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
+ ret <2 x i16> %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
+; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
+ ret void
+}
+
+define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
+; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
+ ret <2 x half> %ret
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
+; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
+ ret void
+}
+
+define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
+; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
+ ret <2 x i16> %ret
+}
+
+define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
+; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
+ ret void
+}
+
+define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
+; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
+ ret <2 x i16> %ret
+}
+
+define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
+; GFX12-LABEL: global_atomic_pk_add_v2f16:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ ret void
+}
+
+define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
+; GFX12-LABEL: global_atomic_pk_add_v2f16_rtn:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
+ ret <2 x half> %ret
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
+ ret <2 x half> %ret
+}
+
+define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret <2 x half> %ret
+}
+
+define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: struct_buffer_atomic_add_v2f16_ret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %r = bitcast <2 x half> %orig to float
+ ret float %r
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: struct_buffer_atomic_add_v2f16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %r = bitcast <2 x i16> %orig to float
+ ret float %r
+}
+
+define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_noret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2bf16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+ %ret = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ ret void
+}
+
+define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: raw_buffer_atomic_add_v2bf16_ret:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+ %orig = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %ret = bitcast <2 x i16> %orig to float
+ ret float %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
new file mode 100644
index 000000000000000..9affd307c6a9709
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
+; GFX12-LABEL: atomic_pk_add_f16_1d_v2:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+main_body:
+ %out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %out_float = bitcast <2 x half> %out to float
+ ret float %out_float
+}
+
+define amdgpu_ps float @atomic_pk_add_f16_1d_v4(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) {
+; GFX12-LABEL: atomic_pk_add_f16_1d_v4:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+main_body:
+ %out = call <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %out_v2f32 = bitcast <4 x half> %out to <2 x float>
+ %out0 = extractelement <2 x float> %out_v2f32, i32 0
+ ret float %out0
+}
+
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x i16> %data, i32 %s) {
+; GFX12-LABEL: atomic_pk_add_bf16_1d_v2:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+main_body:
+ %out = call <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %out_i32 = bitcast <2 x i16> %out to i32
+ %out_float = bitcast i32 %out_i32 to float
+ ret float %out_float
+}
+
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x i16> %data, i32 %s) {
+; GFX12-LABEL: atomic_pk_add_bf16_1d_v4:
+; GFX12: ; %bb.0: ; %main_body
+; GFX12-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: ; return to shader part epilog
+main_body:
+ %out = call <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ %out_v2i32 = bitcast <4 x i16> %out to <2 x i32>
+ %out0 = extractelement <2 x i32> %out_v2i32, i32 0
+ %out_float = bitcast i32 %out0 to float
+ ret float %out_float
+}
+
+declare <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half>, i32, <8 x i32>, i32, i32)
+declare <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half>, i32, <8 x i32>, i32, i32)
+declare <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16>, i32, <8 x i32>, i32, i32)
+declare <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16>, i32, <8 x i32>, i32, i32)
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index 89078c1ad4e0498..d530d2e09c7628e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -22,6 +22,9 @@ buffer_atomic_min_f64 v[2:3], off, s[12:15], s4 offset:4095
buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
buffer_inv
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
@@ -169,9 +172,15 @@ flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+flat_atomic_pk_add_bf16 v1, v[2:3], v2 th:TH_ATOMIC_RETURN
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+flat_atomic_pk_add_f16 v1, v[2:3], v2 th:TH_ATOMIC_RETURN
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
@@ -196,6 +205,9 @@ global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0
global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
global_load_lds_dword v2, s[4:5] offset:4
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
@@ -220,6 +232,12 @@ image_atomic_fmax v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D glc
image_atomic_fmin v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D glc
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
image_gather4_b_cl_o v[252:255], v[1:8], s[8:15], s[12:15] dmask:0x1
// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
index ba32dc8820eaada..2cc7d6b085c9e97 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s
@@ -939,6 +939,81 @@ ds_permute_b32 v5, v1, v2 offset:0
ds_permute_b32 v255, v255, v255 offset:4
// GFX12: [0x04,0x00,0xc8,0xda,0xff,0xff,0x00,0xff]
+ds_pk_add_f16 v2, v1
+// GFX12: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_f16 v2, v1 offset:0
+// GFX12: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_f16 v2, v1 offset:4660
+// GFX12: [0x34,0x12,0x68,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_f16 v2, v1 offset:65535
+// GFX12: [0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_f16 v255, v255
+// GFX12: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_f16 v255, v255 offset:0
+// GFX12: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_f16 v255, v255 offset:4660
+// GFX12: [0x34,0x12,0x68,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_f16 v255, v255 offset:65535
+// GFX12: [0xff,0xff,0x68,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_f16 v0, v0
+// GFX12: [0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00]
+
+ds_pk_add_bf16 v2, v1
+// GFX12: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_bf16 v2, v1 offset:0
+// GFX12: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
+
+ds_pk_add_bf16 v255, v255
+// GFX12: [0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_bf16 v255, v255 offset:4660
+// GFX12: [0x34,0x12,0x6c,0xda,0xff,0xff,0x00,0x00]
+
+ds_pk_add_bf16 v0, v0
+// GFX12: [0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00]
+
+ds_pk_add_bf16 v0, v0 offset:65535
+// GFX12: [0xff,0xff,0x6c,0xda,0x00,0x00,0x00,0x00]
+
+ds_pk_add_rtn_f16 v3, v2, v1
+// GFX12: [0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03]
+
+ds_pk_add_rtn_f16 v3, v2, v1 offset:4660
+// GFX12: [0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03]
+
+ds_pk_add_rtn_f16 v255, v0, v200
+// GFX12: [0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff]
+
+ds_pk_add_rtn_f16 v255, v0, v200 offset:65535
+// GFX12: [0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff]
+
+ds_pk_add_rtn_f16 v255, v255, v255
+// GFX12: [0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff]
+
+ds_pk_add_rtn_bf16 v3, v2, v1
+// GFX12: [0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03]
+
+ds_pk_add_rtn_bf16 v3, v2, v1 offset:4660
+// GFX12: [0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03]
+
+ds_pk_add_rtn_bf16 v255, v0, v200
+// GFX12: [0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff]
+
+ds_pk_add_rtn_bf16 v255, v255, v255
+// GFX12: [0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff]
+
+ds_pk_add_rtn_bf16 v255, v255, v255 offset:65535
+// GFX12: [0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff]
+
ds_read2_b32 v[5:6], v1
// GFX12: [0x00,0x00,0xdc,0xd8,0x01,0x00,0x00,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
index a7a256cfd2b8fef..ca9fb549769bc89 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
@@ -2527,6 +2527,138 @@ buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_NT
buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS
// GFX12: encoding: [0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f]
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:8388607
+// GFX12: encoding: [0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:8388607
+// GFX12: encoding: [0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:8388607
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
+// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], 0 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], -1 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], 0.5 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], -4.0 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 glc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 slc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 dlc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:8388607
+// GFX12: encoding: [0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:8388607
+// GFX12: encoding: [0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:8388607
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
+// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:8388607
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 glc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 slc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 dlc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:8388607
// GFX12: encoding: [0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
index c0ffc5247d90e83..c435bc5a16dbede 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
@@ -882,6 +882,66 @@ global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN
global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64
// GFX12: encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+
+global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_f16 v0, v2, s[0:1]
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+
+global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64
+// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_f16 v[0:1], v2, off offset:-64
+// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_f16 v[0:1], v2, off offset:64
+// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+
+global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_bf16 v0, v2, s[0:1]
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+
+global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:64
+// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64
+// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_pk_add_bf16 v[0:1], v2, off offset:64
+// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
global_atomic_dec_u32 v0, v2, s[0:1] offset:-64
// GFX12: encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
index 72bc164c5e9bdc0..093d4de5525f440 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s
@@ -883,6 +883,60 @@ image_atomic_dec_uint v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_R
image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
// GFX12: encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
+image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// GFX12: encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+image_atomic_pk_add_f16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+// GFX12: encoding: [0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00]
+
+image_atomic_pk_add_f16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D
+// GFX12: encoding: [0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_f16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+// GFX12: encoding: [0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_f16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY
+// GFX12: encoding: [0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00]
+
+image_atomic_pk_add_f16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY
+// GFX12: encoding: [0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_f16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA
+// GFX12: encoding: [0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_f16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+// GFX12: encoding: [0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07]
+
+image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+// GFX12: encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
+
+image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+// GFX12: encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+
+image_atomic_pk_add_bf16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+// GFX12: encoding: [0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00]
+
+image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D
+// GFX12: encoding: [0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_bf16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+// GFX12: encoding: [0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_bf16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY
+// GFX12: encoding: [0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00]
+
+image_atomic_pk_add_bf16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY
+// GFX12: encoding: [0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_bf16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA
+// GFX12: encoding: [0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+
+image_atomic_pk_add_bf16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+// GFX12: encoding: [0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07]
+
+image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+// GFX12: encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
+
image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7]
// GFX12: encoding: [0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
index d3c0e714949907e..d52c4fef0b8ad49 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt
@@ -1659,6 +1659,66 @@
# GFX12: ds_permute_b32 v255, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xfe,0xfd,0x00,0xff]
0xff,0xff,0xc8,0xda,0xfe,0xfd,0x00,0xff
+# GFX12: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00]
+0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00
+
+# GFX12: ds_pk_add_f16 v2, v1 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00]
+0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00
+
+# GFX12: ds_pk_add_f16 v255, v255 ; encoding: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00]
+0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00
+
+# GFX12: ds_pk_add_f16 v0, v0 ; encoding: [0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00]
+0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00
+
+# GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
+0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00
+
+# gfx12: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00]
+0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00
+
+# GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00]
+0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00
+
+# GFX12: ds_pk_add_bf16 v255, v255 ; encoding: [0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00]
+0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00
+
+# GFX12: ds_pk_add_bf16 v255, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0xff,0x00,0x00]
+0xff,0xff,0x6c,0xda,0xff,0xff,0x00,0x00
+
+# GFX12: ds_pk_add_bf16 v0, v0 ; encoding: [0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00]
+0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00
+
+# GFX12: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03]
+0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03
+
+# GFX12: ds_pk_add_rtn_f16 v3, v2, v1 offset:4660 ; encoding: [0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03]
+0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03
+
+# GFX12: ds_pk_add_rtn_f16 v255, v0, v200 ; encoding: [0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff]
+0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff
+
+# GFX12: ds_pk_add_rtn_f16 v255, v0, v200 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff]
+0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff
+
+# GFX12: ds_pk_add_rtn_f16 v255, v255, v255 ; encoding: [0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff]
+0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff
+
+# GFX12: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03]
+0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03
+
+# GFX12: ds_pk_add_rtn_bf16 v3, v2, v1 offset:4660 ; encoding: [0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03]
+0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03
+
+# GFX12: ds_pk_add_rtn_bf16 v255, v0, v200 ; encoding: [0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff]
+0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff
+
+# GFX12: ds_pk_add_rtn_bf16 v255, v255, v255 ; encoding: [0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff]
+0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff
+
+# GFX12: ds_pk_add_rtn_bf16 v255, v255, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff]
+0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff
+
# GFX12: ds_load_2addr_b32 v[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0xfe]
0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0xfe
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
index ff8437155e12edd..d882fcb6c69a0b9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt
@@ -1560,6 +1560,90 @@
# GFX12: buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f]
0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:8388607 ; encoding: [0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:8388607 ; encoding: [0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
+0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f
+
+# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
+0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f
+
# GFX12: buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
index d7f9daf295845a6..78a96f68918c460 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
@@ -525,6 +525,66 @@
# GFX12: global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00
+
+# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_f16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_f16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff
+
+# GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
# GFX12: global_atomic_dec_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
index eff2e09e31b1fe4..46331d6476171c0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt
@@ -883,6 +883,60 @@
# GFX12: image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00
+# GFX12: image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: image_atomic_pk_add_f16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00]
+0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00
+
+# GFX12: image_atomic_pk_add_f16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_f16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_f16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00]
+0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00
+
+# GFX12: image_atomic_pk_add_f16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_f16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA ; encoding: [0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_f16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07]
+0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07
+
+# GFX12: image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
+0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00]
+0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00]
+0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA ; encoding: [0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00]
+0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00
+
+# GFX12: image_atomic_pk_add_bf16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07]
+0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07
+
+# GFX12: image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00]
+0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00
+
# GFX12: image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7] ; encoding: [0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e]
0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e
>From f8c0a07eda452f179e85ffa48b3510feb116564e Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Mon, 15 Jan 2024 08:25:42 +0100
Subject: [PATCH 2/6] Use bf16 type
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 18 +++---
llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 +-
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 60 ++++++++++++-------
.../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 31 ++++------
4 files changed, 59 insertions(+), 54 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 4730c25d942538d..e13bfbc0d29de71 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1034,7 +1034,7 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
- defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimAtomic<"ATOMIC_PK_ADD_BF16">;
+ defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">;
}
//////////////////////////////////////////////////////////////////////////
@@ -1319,8 +1319,8 @@ def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12+ intrinsic
def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
- [llvm_v2i16_ty],
- [llvm_v2i16_ty,
+ [llvm_v2bf16_ty],
+ [llvm_v2bf16_ty,
llvm_v4i32_ty,
llvm_i32_ty,
llvm_i32_ty,
@@ -1328,8 +1328,8 @@ def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
[ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
- [llvm_v2i16_ty],
- [llvm_v2i16_ty,
+ [llvm_v2bf16_ty],
+ [llvm_v2bf16_ty,
AMDGPUBufferRsrcTy,
llvm_i32_ty,
llvm_i32_ty,
@@ -1413,8 +1413,8 @@ def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloa
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
- [llvm_v2i16_ty],
- [llvm_v2i16_ty,
+ [llvm_v2bf16_ty],
+ [llvm_v2bf16_ty,
llvm_v4i32_ty,
llvm_i32_ty,
llvm_i32_ty,
@@ -1423,8 +1423,8 @@ def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
[ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
- [llvm_v2i16_ty],
- [llvm_v2i16_ty,
+ [llvm_v2bf16_ty],
+ [llvm_v2bf16_ty,
AMDGPUBufferRsrcTy,
llvm_i32_ty,
llvm_i32_ty,
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 9e6f602a202977d..48be9ba57900829 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1246,7 +1246,7 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
let SubtargetPredicate = isGFX12Plus in {
let FPAtomic = 1 in
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_pk_add_bf16", VGPR_32, v2i16
+ "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
>;
}
@@ -1714,7 +1714,7 @@ let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
let SubtargetPredicate = isGFX12Plus in {
- defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2i16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
}
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 0b06013a7324e04..5423d07a96900e8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -3,9 +3,9 @@
; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
-declare <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16>, <4 x i32>, i32, i32, i32, i32 immarg)
+declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
-declare <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32, i32, i32)
+declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
@@ -41,8 +41,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: ds_pk_add_bf16 v0, v1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: buffer_gl0_inv
-; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
@@ -52,8 +51,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-GISEL-NEXT: buffer_gl0_inv
-; GFX12-GISEL-NEXT: buffer_gl1_inv
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret void
@@ -84,8 +82,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: buffer_gl0_inv
-; GFX12-NEXT: buffer_gl1_inv
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
@@ -94,8 +91,7 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-GISEL-NEXT: buffer_gl0_inv
-; GFX12-GISEL-NEXT: buffer_gl1_inv
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
ret <2 x i16> %ret
@@ -362,24 +358,34 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
ret void
}
-define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret:
; GFX12: ; %bb.0:
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
- %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
- %r = bitcast <2 x i16> %orig to float
- ret float %r
+ %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ store <2 x bfloat> %orig, ptr null
+ ret float 1.0
}
-define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_noret:
; GFX12: ; %bb.0:
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
@@ -393,11 +399,11 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x i16> %val, <4
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
+ %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: raw_buffer_atomic_add_v2bf16:
; GFX12: ; %bb.0:
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
@@ -411,23 +417,33 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x i16> %val, <4 x i32> in
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
- %ret = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
ret void
}
-define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
+define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
; GFX12-LABEL: raw_buffer_atomic_add_v2bf16_ret:
; GFX12: ; %bb.0:
; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-GISEL-NEXT: ; return to shader part epilog
- %orig = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
- %ret = bitcast <2 x i16> %orig to float
- ret float %ret
+ %orig = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
+ store <2 x bfloat> %orig, ptr null
+ ret float 1.0
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 9affd307c6a9709..b2d59f995a87ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -27,34 +27,23 @@ main_body:
ret float %out0
}
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x i16> %data, i32 %s) {
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) {
; GFX12-LABEL: atomic_pk_add_bf16_1d_v2:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: ; return to shader part epilog
main_body:
- %out = call <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
- %out_i32 = bitcast <2 x i16> %out to i32
- %out_float = bitcast i32 %out_i32 to float
- ret float %out_float
-}
-
-define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x i16> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_bf16_1d_v4:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
-main_body:
- %out = call <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
- %out_v2i32 = bitcast <4 x i16> %out to <2 x i32>
- %out0 = extractelement <2 x i32> %out_v2i32, i32 0
- %out_float = bitcast i32 %out0 to float
- ret float %out_float
+ %out = call <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ store <2 x bfloat> %out, ptr null
+ ret float 1.0
}
declare <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half>, i32, <8 x i32>, i32, i32)
declare <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half>, i32, <8 x i32>, i32, i32)
-declare <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16>, i32, <8 x i32>, i32, i32)
-declare <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16>, i32, <8 x i32>, i32, i32)
+declare <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat>, i32, <8 x i32>, i32, i32)
>From 41752e7d1f57ca11cccfb74725cb2ce3fc8542fe Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Wed, 17 Jan 2024 07:26:17 +0100
Subject: [PATCH 3/6] Update tests for <4 x bfloat>
---
.../CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index b2d59f995a87ef7..b2454d4e085736c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -44,6 +44,14 @@ main_body:
ret float 1.0
}
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
+main_body:
+ %out = call <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ store <4 x bfloat> %out, ptr null
+ ret float 1.0
+}
+
declare <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half>, i32, <8 x i32>, i32, i32)
declare <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half>, i32, <8 x i32>, i32, i32)
declare <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat>, i32, <8 x i32>, i32, i32)
+declare <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat>, i32, <8 x i32>, i32, i32)
>From 130823dd8518da0557777721a7c9a0b4aa8050c3 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Wed, 17 Jan 2024 12:40:06 +0100
Subject: [PATCH 4/6] Update image.atomic tests for unused return value
---
.../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 232 ++++++++++++++++--
1 file changed, 210 insertions(+), 22 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index b2454d4e085736c..ae468e8b259afae 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -1,25 +1,74 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
+; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_f16_1d_v2:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
%out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
%out_float = bitcast <2 x half> %out to float
ret float %out_float
}
+define amdgpu_ps float @atomic_pk_add_f16_1d_v2_nt(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_nt:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_nt:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+ %out_float = bitcast <2 x half> %out to float
+ ret float %out_float
+}
+
+define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %unused = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float 1.0
+}
+
define amdgpu_ps float @atomic_pk_add_f16_1d_v4(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_f16_1d_v4:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
%out = call <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
%out_v2f32 = bitcast <4 x half> %out to <2 x float>
@@ -27,30 +76,169 @@ main_body:
ret float %out0
}
+define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %unused = call <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float 1.0
+}
+
define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) {
-; GFX12-LABEL: atomic_pk_add_bf16_1d_v2:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: flat_store_b32 v[1:2], v0
-; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
%out = call <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
store <2 x bfloat> %out, ptr null
ret float 1.0
}
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %unused = call <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float 1.0
+}
+
define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
%out = call <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
store <4 x bfloat> %out, ptr null
ret float 1.0
}
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %unused = call <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float 1.0
+}
+
+define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
+; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt:
+; GFX12-GISEL: ; %bb.0: ; %main_body
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
+; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-GISEL-NEXT: ; return to shader part epilog
+main_body:
+ %unused = call <4 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4bf16.v4bf16(<4 x bfloat> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+ ret float 1.0
+}
+
+
declare <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half>, i32, <8 x i32>, i32, i32)
declare <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half>, i32, <8 x i32>, i32, i32)
declare <2 x bfloat> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2bf16.v2bf16(<2 x bfloat>, i32, <8 x i32>, i32, i32)
>From 549678cf6873a1bdb44543c843ccacd4d739e01b Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 18 Jan 2024 11:20:34 +0100
Subject: [PATCH 5/6] Use explicit global-isel=0 in test
---
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index ae468e8b259afae..c58c4cf261f9b6b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
>From ca3c62daf22555ee2b9258cf789212be79e4fa27 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Thu, 18 Jan 2024 11:54:46 +0100
Subject: [PATCH 6/6] Use mtriple instead of march in tests
---
.../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 302 +++++++++---------
.../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 4 +-
2 files changed, 153 insertions(+), 153 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 5423d07a96900e8..c599c1b676589b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
-; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-SDAG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
@@ -14,13 +14,13 @@ declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x h
declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: ds_pk_add_f16 v0, v1
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -34,15 +34,15 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
}
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: ds_pk_add_bf16 v0, v1
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -58,12 +58,12 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
}
define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
-; GFX12-LABEL: local_atomic_fadd_v2f16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
; GFX12-GISEL: ; %bb.0:
@@ -76,14 +76,14 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
}
define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
-; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
; GFX12-GISEL: ; %bb.0:
@@ -98,14 +98,14 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
}
define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
-; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -120,12 +120,12 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
}
define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
-; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_rtn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
; GFX12-GISEL: ; %bb.0:
@@ -138,14 +138,14 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
}
define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -160,12 +160,12 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
}
define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
-; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
; GFX12-GISEL: ; %bb.0:
@@ -178,15 +178,15 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
}
define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -202,12 +202,12 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
}
define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
-; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
; GFX12-GISEL: ; %bb.0:
@@ -220,11 +220,11 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
}
define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-; GFX12-LABEL: global_atomic_pk_add_v2f16:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
; GFX12-GISEL: ; %bb.0: ; %main_body
@@ -237,12 +237,12 @@ main_body:
}
define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
-; GFX12-LABEL: global_atomic_pk_add_v2f16_rtn:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16_rtn:
+; GFX12-SDAG: ; %bb.0: ; %main_body
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
; GFX12-GISEL: ; %bb.0: ; %main_body
@@ -256,12 +256,12 @@ main_body:
}
define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
; GFX12-GISEL: ; %bb.0:
@@ -274,12 +274,12 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val,
}
define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -292,11 +292,11 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i
}
define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
; GFX12-GISEL: ; %bb.0:
@@ -308,11 +308,11 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %
}
define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
; GFX12-GISEL: ; %bb.0:
@@ -324,11 +324,11 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4
}
define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: struct_buffer_atomic_add_v2f16_ret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_ret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
; GFX12-GISEL: ; %bb.0:
@@ -341,12 +341,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x
}
define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: struct_buffer_atomic_add_v2f16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -359,16 +359,16 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
}
define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: flat_store_b32 v[1:2], v0
-; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_ret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
; GFX12-GISEL: ; %bb.0:
@@ -386,12 +386,12 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <
}
define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_noret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_noret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
; GFX12-GISEL: ; %bb.0:
@@ -404,12 +404,12 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val,
}
define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16:
; GFX12-GISEL: ; %bb.0:
@@ -422,16 +422,16 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32>
}
define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
-; GFX12-LABEL: raw_buffer_atomic_add_v2bf16_ret:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: flat_store_b32 v[1:2], v0
-; GFX12-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX12-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12-NEXT: ; return to shader part epilog
+; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16_ret:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
; GFX12-GISEL: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index c58c4cf261f9b6b..04767a4977515c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
-; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s
+; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s
define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2:
More information about the libcxx-commits
mailing list