[llvm] [AMDGPU] Set MaxAtomicSizeInBitsSupported. (PR #75185)
James Y Knight via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 12 06:21:18 PST 2023
https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/75185
From dabbada1f693b9a87b6bb3ce59a7bd0f8984cc60 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Tue, 12 Dec 2023 08:34:41 -0500
Subject: [PATCH 1/2] [AMDGPU] Set MaxAtomicSizeInBitsSupported.
This will result in larger atomic operations getting expanded to
`__atomic_*` libcalls via AtomicExpandPass, which matches what Clang
already does in the frontend.
While AMDGPU currently disables the use of all libcalls, I've changed
it to instead disable all of them _except_ the atomic ones. Those are
already emitted by the Clang frontend, and enabling them in the
backend allows the same behavior there.
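As a minimal sketch (my paraphrase, not code from the patch) of the
guard the new loop encodes, assuming the RTLIB atomic entries form one
contiguous range, as the patch relies on (header path as of this
revision):

    #include "llvm/CodeGen/RuntimeLibcalls.h"

    // Keep only the __atomic_* libcalls; everything else stays disabled.
    static bool isAtomicLibcall(RTLIB::Libcall LC) {
      return LC >= RTLIB::ATOMIC_LOAD && LC <= RTLIB::ATOMIC_FETCH_NAND_16;
    }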
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++++---
llvm/test/CodeGen/AMDGPU/atomic-oversize.ll | 10 ++++++++++
2 files changed, 17 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1f..78092675057df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -506,9 +506,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
- // There are no libcalls of any kind.
- for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
- setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ // Disable most libcalls.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+ if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ }
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
@@ -556,6 +558,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
ISD::FSUB, ISD::FNEG,
ISD::FABS, ISD::AssertZext,
ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
+
+ setMaxAtomicSizeInBitsSupported(64);
}
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll b/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
new file mode 100644
index 0000000000000..f62a93f523365
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomic-oversize.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+
+define void @test(ptr %a) nounwind {
+; CHECK-LABEL: test:
+; CHECK: __atomic_load_16
+; CHECK: __atomic_store_16
+ %1 = load atomic i128, ptr %a seq_cst, align 16
+ store atomic i128 %1, ptr %a seq_cst, align 16
+ ret void
+}
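A note on the test: the sized `__atomic_load_16`/`__atomic_store_16`
calls it checks for follow the documented `__atomic_*` libcall ABI;
roughly, as C++ declarations (taken from the docs, not from this patch):

    // 16-byte sized variants: the load returns the value, the store
    // takes it by value; the trailing int is the memory-ordering constant.
    extern "C" __int128 __atomic_load_16(const volatile void *ptr, int order);
    extern "C" void __atomic_store_16(volatile void *ptr, __int128 val,
                                      int order);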
From 3f64deb990bed8c501fa4c6d31293eb8588a214c Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Tue, 12 Dec 2023 09:20:26 -0500
Subject: [PATCH 2/2] Adjust llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll not to expect to crash.
---
.../AtomicExpand/AMDGPU/unaligned-atomic.ll | 24 +++++++++----------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
index bdfd90dc11dca..6c84474edc05b 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
@@ -1,15 +1,13 @@
-; RUN: not --crash opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s 2>&1 | FileCheck %s
-; The AtomicExpand pass cannot handle missing libcalls (yet) so reports a fatal error.
-; CHECK: LLVM ERROR: expandAtomicOpToLibcall shouldn't fail for Load
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s 2>&1 | FileCheck --check-prefix=GCN %s
define i32 @atomic_load_global_align1(ptr addrspace(1) %ptr) {
; GCN-LABEL: @atomic_load_global_align1(
; GCN-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
-; GCN-NEXT: [[TMP3:%.*]] = alloca i32, align 4
-; GCN-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP3]])
-; GCN-NEXT: call void @0(i64 4, ptr [[TMP2]], ptr [[TMP3]], i32 5)
-; GCN-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
-; GCN-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
+; GCN-NEXT: [[TMP3:%.*]] = alloca i32, align 4, addrspace(5)
+; GCN-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP3]])
+; GCN-NEXT: call void @__atomic_load(i64 4, ptr [[TMP2]], ptr addrspace(5) [[TMP3]], i32 5)
+; GCN-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[TMP3]], align 4
+; GCN-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP3]])
; GCN-NEXT: ret i32 [[TMP5]]
;
%val = load atomic i32, ptr addrspace(1) %ptr seq_cst, align 1
@@ -19,11 +17,11 @@ define i32 @atomic_load_global_align1(ptr addrspace(1) %ptr) {
define void @atomic_store_global_align1(ptr addrspace(1) %ptr, i32 %val) {
; GCN-LABEL: @atomic_store_global_align1(
; GCN-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
-; GCN-NEXT: [[TMP3:%.*]] = alloca i32, align 4
-; GCN-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP3]])
-; GCN-NEXT: store i32 [[VAL:%.*]], ptr [[TMP3]], align 4
-; GCN-NEXT: call void @1(i64 4, ptr [[TMP2]], ptr [[TMP3]], i32 0)
-; GCN-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]])
+; GCN-NEXT: [[TMP3:%.*]] = alloca i32, align 4, addrspace(5)
+; GCN-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[TMP3]])
+; GCN-NEXT: store i32 [[VAL:%.*]], ptr addrspace(5) [[TMP3]], align 4
+; GCN-NEXT: call void @__atomic_store(i64 4, ptr [[TMP2]], ptr addrspace(5) [[TMP3]], i32 0)
+; GCN-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[TMP3]])
; GCN-NEXT: ret void
;
store atomic i32 %val, ptr addrspace(1) %ptr monotonic, align 1
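For context, the generic `__atomic_load`/`__atomic_store` calls the
updated CHECK lines expect take an explicit byte size and move the value
through a temporary buffer (here the addrspace(5) alloca); roughly,
again per the documented libcall ABI rather than this patch:

    #include <cstddef>

    // Generic (size_t-based) variants, used when no sized or inline
    // lowering applies, e.g. for these unaligned i32 accesses; the
    // memory-ordering constant comes last.
    extern "C" void __atomic_load(std::size_t size, void *ptr, void *ret,
                                  int order);
    extern "C" void __atomic_store(std::size_t size, void *ptr, void *val,
                                   int order);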