[llvm] [AMDGPU] Add an option to disable unsafe uses of atomic xor (PR #69229)

Mon Oct 16 10:38:52 PDT 2023

https://github.com/pasaulais created https://github.com/llvm/llvm-project/pull/69229

On some systems and when accessing certain memory areas it is not safe to use atomic xor with AMDGPU. This may be the case when using fine-grained memory allocations (e.g. through `hipAllocManaged`) and when the atomic operation needs to go through the PCIe bus, which does not support atomic xor (PCIe 3.0 does support other atomic operations like fetch_and_add and cmpxchg) [1].

The issue has been worked around in DPC++/HIP by prefetching memory before executing kernels [2], however this adds an overhead that can outweigh the performance benefit of using atomic xor over a cmpxchg loop. This is why a way to switch between 'native' atomic xor and a cmpxchg-loop solution is needed.

These changes add `-munsafe-int-atomics` and `-mno-unsafe-int-atomics` options to clang that can be used to switch between the two implementations. The default is the current behaviour of generative 'native' atomic xor instructions. When `-mno-unsafe-int-atomics` is passed to clang, functions are marked with the `amdgpu-unsafe-int-atomics` attribute (set to `false`) at the IR level and this tells the backend to expand atomic xor to a cmpxchg loop. The options only affect the global and flat address spaces.

[1]: https://github.com/RadeonOpenCompute/ROCm/issues/2481
[2]: https://github.com/intel/llvm/pull/10430

>From 508e577fa67b590ad297e45aeb5210ec3190ebc9 Mon Sep 17 00:00:00 2001
From: Pierre-Andre Saulais <pierre-andre at codeplay.com>
Date: Thu, 12 Oct 2023 16:00:20 +0100
Subject: [PATCH] [AMDGPU] Add an option to disable unsafe uses of atomic xor

---
 clang/include/clang/Basic/TargetInfo.h        |  7 +++
 clang/include/clang/Basic/TargetOptions.h     |  3 +
 clang/include/clang/Driver/Options.td         |  8 +++
 clang/lib/Basic/TargetInfo.cpp                |  1 +
 clang/lib/Basic/Targets/AMDGPU.cpp            |  1 +
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  3 +
 clang/lib/Driver/ToolChains/Clang.cpp         |  2 +
 .../amdgpu-unsafe-int-atomics.cl              | 55 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 15 +++++
 llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll  | 23 ++++++--
 10 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl

diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 9d56e97a3d4bb88..96416c4405115cd 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -253,6 +253,8 @@ class TargetInfo : public TransferrableTargetInfo,
 
   unsigned AllowAMDGPUUnsafeFPAtomics : 1;
 
+  unsigned AllowAMDGPUUnsafeIntAtomics : 1;
+
   unsigned ARMCDECoprocMask : 8;
 
   unsigned MaxOpenCLWorkGroupSize;
@@ -995,6 +997,11 @@ class TargetInfo : public TransferrableTargetInfo,
   /// allowed.
   bool allowAMDGPUUnsafeFPAtomics() const { return AllowAMDGPUUnsafeFPAtomics; }
 
+  /// Returns whether or not the AMDGPU unsafe integer atomics are allowed.
+  bool allowAMDGPUUnsafeIntAtomics() const {
+    return AllowAMDGPUUnsafeIntAtomics;
+  }
+
   /// For ARM targets returns a mask defining which coprocessors are configured
   /// as Custom Datapath.
   uint32_t getARMCDECoprocMask() const { return ARMCDECoprocMask; }
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index ba3acd029587160..4feb6f2dcd86b83 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -78,6 +78,9 @@ class TargetOptions {
   /// \brief If enabled, allow AMDGPU unsafe floating point atomics.
   bool AllowAMDGPUUnsafeFPAtomics = false;
 
+  /// \brief If enabled, allow AMDGPU unsafe integer atomics.
+  bool AllowAMDGPUUnsafeIntAtomics = true;
+
   /// \brief Enumeration value for AMDGPU code object version, which is the
   /// code object version times 100.
   enum CodeObjectVersionKind {
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a89d6b6579f1176..265f5c23e7858ee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4705,6 +4705,14 @@ defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
           "for certain memory destinations. (AMDGPU only)">,
   NegFlag<SetFalse>>, Group<m_Group>;
 
+defm unsafe_int_atomics : BoolOption<"m", "unsafe-int-atomics",
+  TargetOpts<"AllowAMDGPUUnsafeIntAtomics">, DefaultTrue,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+          "Enable generation of unsafe integer atomic instructions. May "
+          "generate more efficient code, but may give incorrect results "
+          "for certain memory destinations. (AMDGPU only)">,
+  NegFlag<SetFalse, [], [ClangOption, CC1Option]>>, Group<m_Group>;
+
 def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[NoXarchOption]>;
 def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>,
   Flags<[NoXarchOption]>;
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 6cd5d618a4acaa5..06997cc54642623 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -157,6 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   HasAArch64SVETypes = false;
   HasRISCVVTypes = false;
   AllowAMDGPUUnsafeFPAtomics = false;
+  AllowAMDGPUUnsafeIntAtomics = true;
   ARMCDECoprocMask = 0;
 
   // Default to no types using fpret.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 409ae32ab424215..20ad8e5c59c2b66 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -232,6 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
   HasFloat16 = true;
   WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
   AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics;
+  AllowAMDGPUUnsafeIntAtomics = Opts.AllowAMDGPUUnsafeIntAtomics;
 
   // Set pointer width and alignment for the generic address space.
   PointerWidth = PointerAlign = getPointerWidthV(LangAS::Default);
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index f6a614b3e4d54dd..9be15d89d3f2f46 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -409,6 +409,9 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
   if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
     F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
 
+  if (!M.getContext().getTargetInfo().allowAMDGPUUnsafeIntAtomics())
+    F->addFnAttr("amdgpu-unsafe-int-atomics", "false");
+
   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
     F->addFnAttr("amdgpu-ieee", "false");
 }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 94c184435ae14de..be3076cdbdc4a68 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7372,6 +7372,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
     Args.addOptInFlag(CmdArgs, options::OPT_munsafe_fp_atomics,
                       options::OPT_mno_unsafe_fp_atomics);
+    Args.addOptOutFlag(CmdArgs, options::OPT_munsafe_int_atomics,
+                       options::OPT_mno_unsafe_int_atomics);
     Args.addOptOutFlag(CmdArgs, options::OPT_mamdgpu_ieee,
                        options::OPT_mno_amdgpu_ieee);
   }
diff --git a/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
new file mode 100644
index 000000000000000..02dfddd5791b405
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
@@ -0,0 +1,55 @@
+// REQUIRES: amdgpu-registered-target
+//
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN:   -emit-llvm \
+// RUN:   | FileCheck -check-prefixes=COMMON,UNSAFE-INT-DEFAULT %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN:   -emit-llvm -munsafe-int-atomics \
+// RUN:   | FileCheck -check-prefixes=COMMON,UNSAFE-INT-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN:   -emit-llvm -mno-unsafe-int-atomics \
+// RUN:   | FileCheck -check-prefixes=COMMON,UNSAFE-INT-OFF %s
+
+// Check AMDGCN ISA generation.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN:   -munsafe-int-atomics \
+// RUN:   | FileCheck -check-prefixes=COMMON-ISA,ISA-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN:   -mno-unsafe-int-atomics \
+// RUN:   | FileCheck -check-prefixes=COMMON-ISA,ISA-OFF %s
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
+} memory_scope;
+
+// COMMON-ISA: kern:
+// ISA-ON: flat_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc 
+// ISA-OFF: flat_atomic_cmpswap v{{[0-9]+}},  v[{{[0-9]+}}:{{[0-9]+}}],  v[{{[0-9]+}}:{{[0-9]+}}] glc
+kernel void kern(global atomic_int *x, int y, global int *z) {
+  *z = __opencl_atomic_fetch_xor(x, y, memory_order_seq_cst, memory_scope_work_group);
+}
+
+// COMMON: define{{.*}} amdgpu_kernel void @kern{{.*}} [[ATTRS1:#[0-9]+]]
+// COMMON: atomicrmw xor ptr addrspace(1) %x, i32 %y syncscope("workgroup") seq_cst, align 4
+//
+// UNSAFE-INT-DEFAULT-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"
+// UNSAFE-INT-ON-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"
+// UNSAFE-INT-OFF: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"="false"
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c5b166c9652238..c379cf4dd64642f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14975,6 +14975,14 @@ bool unsafeFPAtomicsDisabled(Function *F) {
          "true";
 }
 
+// The amdgpu-unsafe-int-atomics attribute disables generation of unsafe
+// integer atomic instructions. May generate more efficient code, but may
+// give incorrect results for certain memory destinations.
+bool unsafeIntAtomicsDisabled(Function *F) {
+  return F->getFnAttribute("amdgpu-unsafe-int-atomics").getValueAsString() ==
+         "false";
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
@@ -15097,6 +15105,13 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     }
     break;
   }
+  case AtomicRMWInst::Xor: {
+    if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
+        unsafeIntAtomicsDisabled(RMW->getFunction())) {
+      return AtomicExpansionKind::CmpXChg;
+    }
+    break;
+  }
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 92b70ad797b88ae..6cc97019d3ff5ce 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1100 %s
 
 define float @syncscope_system(ptr %addr, float %val) #0 {
 ; GFX908-LABEL: syncscope_system:
@@ -391,4 +391,19 @@ define float @no_unsafe(ptr %addr, float %val) {
   ret float %res
 }
 
+define i32 @default_xor(ptr %addr, i32 %val) {
+  ; GCN-LABEL: default_xor:
+  ; GCN: flat_atomic_xor
+  %res = atomicrmw xor ptr %addr, i32 %val seq_cst
+  ret i32 %res
+}
+
+define i32 @no_unsafe_xor(ptr %addr, i32 %val) #1 {
+  ; GCN-LABEL: no_unsafe_xor:
+  ; GCN: flat_atomic_cmpswap
+  %res = atomicrmw xor ptr %addr, i32 %val seq_cst
+  ret i32 %res
+}
+
 attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { "amdgpu-unsafe-int-atomics"="false" }