[clang] [llvm] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions (PR #75917)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Dec 19 02:04:56 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mc
Author: Mariusz Sikora (mariusz-sikora-at-amd)
<details>
<summary>Changes</summary>
- image_atomic_pk_add_f16
- image_atomic_pk_add_bf16
- ds_pk_add_bf16
- ds_pk_add_f16
- ds_pk_add_rtn_bf16
- ds_pk_add_rtn_f16
- flat_atomic_pk_add_f16
- flat_atomic_pk_add_bf16
- global_atomic_pk_add_f16
- global_atomic_pk_add_bf16
- buffer_atomic_pk_add_f16
- buffer_atomic_pk_add_bf16
---
Patch is 97.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75917.diff
29 Files Affected:
- (modified) clang/test/CodeGenOpenCL/amdgpu-features.cl (+2-2)
- (added) clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl (+92)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+45)
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+23-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+4)
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+21)
- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+8-4)
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+4)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+23-12)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1)
- (modified) llvm/lib/TargetParser/TargetParser.cpp (+4)
- (added) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll (+433)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll (+60)
- (modified) llvm/test/MC/AMDGPU/gfx11_unsupported.s (+18)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_ds.s (+75)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s (+132)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vflat.s (+60)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_vimage.s (+54)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt (+60)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt (+84)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt (+60)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt (+54)
``````````diff
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 8959634572b44e..fe1798406967e8 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -100,8 +100,8 @@
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
new file mode 100644
index 00000000000000..20ff12c3376370
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: %s -S -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s
+
+// REQUIRES: amdgpu-registered-target
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+typedef short __attribute__((ext_vector_type(2))) short2;
+
+// CHECK-LABEL: test_local_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// GFX12-LABEL: test_local_add_2bf16
+// GFX12: ds_pk_add_rtn_bf16
+short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2bf16_noret
+// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
+// GFX12-LABEL: test_local_add_2bf16_noret
+// GFX12: ds_pk_add_bf16
+void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
+ __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2f16
+// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// GFX12-LABEL: test_local_add_2f16
+// GFX12: ds_pk_add_rtn_f16
+half2 test_local_add_2f16(__local half2 *addr, half2 x) {
+ return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_local_add_2f16_noret
+// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
+// GFX12-LABEL: test_local_add_2f16_noret
+// GFX12: ds_pk_add_f16
+void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
+ __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_2f16
+// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_flat_add_2f16
+// GFX12: flat_atomic_pk_add_f16
+half2 test_flat_add_2f16(__generic half2 *addr, half2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_flat_add_2bf16
+// GFX12: flat_atomic_pk_add_bf16
+short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) {
+ return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_global_add_half2
+// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
+void test_global_add_half2(__global half2 *addr, half2 x) {
+ half2 *rtn;
+ *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2_noret
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}})
+// GFX12-LABEL: test_global_add_half2_noret
+// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off
+void test_global_add_half2_noret(__global half2 *addr, half2 x) {
+ __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_2bf16
+// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_global_add_2bf16
+// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN
+void test_global_add_2bf16(__global short2 *addr, short2 x) {
+ short2 *rtn;
+ *rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_2bf16_noret
+// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}})
+// GFX12-LABEL: test_global_add_2bf16_noret
+// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off
+void test_global_add_2bf16_noret(__global short2 *addr, short2 x) {
+ __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51bd9b63c127ed..bea39743525b23 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1026,6 +1026,9 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
defm int_amdgcn_image_atomic_cmpswap :
AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
+
+ defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
+ defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimAtomic<"ATOMIC_PK_ADD_BF16">;
}
//////////////////////////////////////////////////////////////////////////
@@ -1294,6 +1297,26 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
+// gfx12+ intrinsic
+def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ llvm_v4i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ AMDGPUBufferRsrcTy,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
@@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
+// gfx12 intrinsic
+def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ llvm_v4i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
+def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
+ [llvm_v2i16_ty],
+ [llvm_v2i16_ty,
+ AMDGPUBufferRsrcTy,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty,
+ llvm_i32_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
+ AMDGPURsrcIntrinsic<1, 0>;
// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7bc..14b2155938eef6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1473,6 +1473,10 @@ def FeatureISAVersion12 : FeatureSet<
FeatureArchitectedFlatScratch,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicDsPkAdd16Insts,
+ FeatureAtomicFlatPkAdd16Insts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeatureAtomicGlobalPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40be..f426b7f38428da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -261,6 +261,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 156a264a7c1faa..8205bdac4e2c5d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5425,6 +5425,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b55199..8d972c46447b8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -587,6 +587,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_FADD_BF16,
BUFFER_ATOMIC_FMIN,
BUFFER_ATOMIC_FMAX,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fbee2888945185..32964e5c2e3ef1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5872,6 +5872,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -6079,6 +6082,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
LLT Ty = MRI->getType(VData);
+ const bool IsAtomicPacked16Bit =
+ (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+
// Check for 16 bit addresses and pack if true.
LLT GradTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
@@ -6087,7 +6094,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const bool IsG16 =
ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
const bool IsA16 = AddrTy == S16;
- const bool IsD16 = Ty.getScalarType() == S16;
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6129,7 +6136,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT Ty = MRI->getType(VData0);
// TODO: Allow atomic swap and bit ops for v2s16/v4s16
- if (Ty.isVector())
+ if (Ty.isVector() && !IsAtomicPacked16Bit)
return false;
if (BaseOpcode->AtomicX2) {
@@ -6265,9 +6272,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (NumElts > 4 || DMaskLanes > 4)
return false;
+ // Image atomic instructions are using DMask to specify how many bits
+ // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
+ // DMaskLanes for image atomic has default value '0'.
+ // We must be sure that atomic variants (especially packed) will not be
+ // truncated from v2s16 or v4s16 to s16 type.
+ //
+ // ChangeElementCount will be needed for image load where Ty is always scalar.
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
const LLT AdjustedTy =
- Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
+ DMaskLanes == 0
+ ? Ty
+ : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
@@ -7069,6 +7085,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::trap:
return legalizeTrapIntrinsic(MI, MRI, B);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9412f720c62ec..d8d19f65190bb4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2920,6 +2920,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
@@ -4200,6 +4201,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index beb670669581f1..f4415aaa6b1ff9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -278,6 +278,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -294,6 +295,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -310,6 +312,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -326,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43d35fa5291ca0..2a6ad3b0d4a25b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1245,6 +1245,13 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
+let SubtargetPredicate = isGFX12Plus in {
+let FPAtomic = 1 in
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_pk_add_bf16", VGPR_32, v2i16
+>;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
@@ -1708,6 +1715,10 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+let SubtargetPredicate = isGFX12Plus in {
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2i16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+}
+
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32,...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/75917
More information about the cfe-commits
mailing list