[llvm] [AMDGPU] Add an option to disable unsafe uses of atomic xor (PR #69229)
Pierre-Andre Saulais via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 16 10:38:52 PDT 2023
https://github.com/pasaulais created https://github.com/llvm/llvm-project/pull/69229
On some systems and when accessing certain memory areas it is not safe to use atomic xor with AMDGPU. This may be the case when using fine-grained memory allocations (e.g. through `hipAllocManaged`) and when the atomic operation needs to go through the PCIe bus, which does not support atomic xor (PCIe 3.0 does support other atomic operations like fetch_and_add and cmpxchg) [1].
The issue has been worked around in DPC++/HIP by prefetching memory before executing kernels [2], however this adds an overhead that can outweigh the performance benefit of using atomic xor over a cmpxchg loop. This is why a way to switch between 'native' atomic xor and a cmpxchg-loop solution is needed.
These changes add `-munsafe-int-atomics` and `-mno-unsafe-int-atomics` options to clang that can be used to switch between the two implementations. The default is the current behaviour of generative 'native' atomic xor instructions. When `-mno-unsafe-int-atomics` is passed to clang, functions are marked with the `amdgpu-unsafe-int-atomics` attribute (set to `false`) at the IR level and this tells the backend to expand atomic xor to a cmpxchg loop. The options only affect the global and flat address spaces.
[1]: https://github.com/RadeonOpenCompute/ROCm/issues/2481
[2]: https://github.com/intel/llvm/pull/10430
>From 508e577fa67b590ad297e45aeb5210ec3190ebc9 Mon Sep 17 00:00:00 2001
From: Pierre-Andre Saulais <pierre-andre at codeplay.com>
Date: Thu, 12 Oct 2023 16:00:20 +0100
Subject: [PATCH] [AMDGPU] Add an option to disable unsafe uses of atomic xor
---
clang/include/clang/Basic/TargetInfo.h | 7 +++
clang/include/clang/Basic/TargetOptions.h | 3 +
clang/include/clang/Driver/Options.td | 8 +++
clang/lib/Basic/TargetInfo.cpp | 1 +
clang/lib/Basic/Targets/AMDGPU.cpp | 1 +
clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 +
clang/lib/Driver/ToolChains/Clang.cpp | 2 +
.../amdgpu-unsafe-int-atomics.cl | 55 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +++++
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 23 ++++++--
10 files changed, 114 insertions(+), 4 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 9d56e97a3d4bb88..96416c4405115cd 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -253,6 +253,8 @@ class TargetInfo : public TransferrableTargetInfo,
unsigned AllowAMDGPUUnsafeFPAtomics : 1;
+ unsigned AllowAMDGPUUnsafeIntAtomics : 1;
+
unsigned ARMCDECoprocMask : 8;
unsigned MaxOpenCLWorkGroupSize;
@@ -995,6 +997,11 @@ class TargetInfo : public TransferrableTargetInfo,
/// allowed.
bool allowAMDGPUUnsafeFPAtomics() const { return AllowAMDGPUUnsafeFPAtomics; }
+ /// Returns whether or not the AMDGPU unsafe integer atomics are allowed.
+ bool allowAMDGPUUnsafeIntAtomics() const {
+ return AllowAMDGPUUnsafeIntAtomics;
+ }
+
/// For ARM targets returns a mask defining which coprocessors are configured
/// as Custom Datapath.
uint32_t getARMCDECoprocMask() const { return ARMCDECoprocMask; }
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index ba3acd029587160..4feb6f2dcd86b83 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -78,6 +78,9 @@ class TargetOptions {
/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
bool AllowAMDGPUUnsafeFPAtomics = false;
+ /// \brief If enabled, allow AMDGPU unsafe integer atomics.
+ bool AllowAMDGPUUnsafeIntAtomics = true;
+
/// \brief Enumeration value for AMDGPU code object version, which is the
/// code object version times 100.
enum CodeObjectVersionKind {
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a89d6b6579f1176..265f5c23e7858ee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4705,6 +4705,14 @@ defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
"for certain memory destinations. (AMDGPU only)">,
NegFlag<SetFalse>>, Group<m_Group>;
+defm unsafe_int_atomics : BoolOption<"m", "unsafe-int-atomics",
+ TargetOpts<"AllowAMDGPUUnsafeIntAtomics">, DefaultTrue,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option],
+ "Enable generation of unsafe integer atomic instructions. May "
+ "generate more efficient code, but may give incorrect results "
+ "for certain memory destinations. (AMDGPU only)">,
+ NegFlag<SetFalse, [], [ClangOption, CC1Option]>>, Group<m_Group>;
+
def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[NoXarchOption]>;
def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>,
Flags<[NoXarchOption]>;
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 6cd5d618a4acaa5..06997cc54642623 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -157,6 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
HasAArch64SVETypes = false;
HasRISCVVTypes = false;
AllowAMDGPUUnsafeFPAtomics = false;
+ AllowAMDGPUUnsafeIntAtomics = true;
ARMCDECoprocMask = 0;
// Default to no types using fpret.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 409ae32ab424215..20ad8e5c59c2b66 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -232,6 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
HasFloat16 = true;
WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics;
+ AllowAMDGPUUnsafeIntAtomics = Opts.AllowAMDGPUUnsafeIntAtomics;
// Set pointer width and alignment for the generic address space.
PointerWidth = PointerAlign = getPointerWidthV(LangAS::Default);
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index f6a614b3e4d54dd..9be15d89d3f2f46 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -409,6 +409,9 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
+ if (!M.getContext().getTargetInfo().allowAMDGPUUnsafeIntAtomics())
+ F->addFnAttr("amdgpu-unsafe-int-atomics", "false");
+
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 94c184435ae14de..be3076cdbdc4a68 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7372,6 +7372,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptInFlag(CmdArgs, options::OPT_munsafe_fp_atomics,
options::OPT_mno_unsafe_fp_atomics);
+ Args.addOptOutFlag(CmdArgs, options::OPT_munsafe_int_atomics,
+ options::OPT_mno_unsafe_int_atomics);
Args.addOptOutFlag(CmdArgs, options::OPT_mamdgpu_ieee,
options::OPT_mno_amdgpu_ieee);
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
new file mode 100644
index 000000000000000..02dfddd5791b405
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
@@ -0,0 +1,55 @@
+// REQUIRES: amdgpu-registered-target
+//
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-DEFAULT %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm -munsafe-int-atomics \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm -mno-unsafe-int-atomics \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-OFF %s
+
+// Check AMDGCN ISA generation.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -munsafe-int-atomics \
+// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -mno-unsafe-int-atomics \
+// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-OFF %s
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+typedef enum memory_order {
+ memory_order_relaxed = __ATOMIC_RELAXED,
+ memory_order_acquire = __ATOMIC_ACQUIRE,
+ memory_order_release = __ATOMIC_RELEASE,
+ memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef enum memory_scope {
+ memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+ memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+ memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+ memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+ memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
+} memory_scope;
+
+// COMMON-ISA: kern:
+// ISA-ON: flat_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc
+// ISA-OFF: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
+kernel void kern(global atomic_int *x, int y, global int *z) {
+ *z = __opencl_atomic_fetch_xor(x, y, memory_order_seq_cst, memory_scope_work_group);
+}
+
+// COMMON: define{{.*}} amdgpu_kernel void @kern{{.*}} [[ATTRS1:#[0-9]+]]
+// COMMON: atomicrmw xor ptr addrspace(1) %x, i32 %y syncscope("workgroup") seq_cst, align 4
+//
+// UNSAFE-INT-DEFAULT-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"
+// UNSAFE-INT-ON-NOT: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"
+// UNSAFE-INT-OFF: attributes [[ATTRS1]] = {{.*}} "amdgpu-unsafe-int-atomics"="false"
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c5b166c9652238..c379cf4dd64642f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14975,6 +14975,14 @@ bool unsafeFPAtomicsDisabled(Function *F) {
"true";
}
+// The amdgpu-unsafe-int-atomics attribute disables generation of unsafe
+// integer atomic instructions. May generate more efficient code, but may
+// give incorrect results for certain memory destinations.
+bool unsafeIntAtomicsDisabled(Function *F) {
+ return F->getFnAttribute("amdgpu-unsafe-int-atomics").getValueAsString() ==
+ "false";
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -15097,6 +15105,13 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
break;
}
+ case AtomicRMWInst::Xor: {
+ if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
+ unsafeIntAtomicsDisabled(RMW->getFunction())) {
+ return AtomicExpansionKind::CmpXChg;
+ }
+ break;
+ }
default:
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 92b70ad797b88ae..6cc97019d3ff5ce 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1100 %s
define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_system:
@@ -391,4 +391,19 @@ define float @no_unsafe(ptr %addr, float %val) {
ret float %res
}
+define i32 @default_xor(ptr %addr, i32 %val) {
+ ; GCN-LABEL: default_xor:
+ ; GCN: flat_atomic_xor
+ %res = atomicrmw xor ptr %addr, i32 %val seq_cst
+ ret i32 %res
+}
+
+define i32 @no_unsafe_xor(ptr %addr, i32 %val) #1 {
+ ; GCN-LABEL: no_unsafe_xor:
+ ; GCN: flat_atomic_cmpswap
+ %res = atomicrmw xor ptr %addr, i32 %val seq_cst
+ ret i32 %res
+}
+
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
+attributes #1 = { "amdgpu-unsafe-int-atomics"="false" }
More information about the llvm-commits
mailing list