[llvm] [clang] [AMDGPU] Add an option to disable unsafe uses of atomic xor (PR #69229)
Pierre-Andre Saulais via cfe-commits
cfe-commits at lists.llvm.org
Mon Dec 4 10:12:50 PST 2023
https://github.com/pasaulais updated https://github.com/llvm/llvm-project/pull/69229
From cd33e8471ab5e768c1b579f1bec1308c0b322e05 Mon Sep 17 00:00:00 2001
From: Pierre-Andre Saulais <pierre-andre at codeplay.com>
Date: Thu, 23 Nov 2023 17:33:18 +0000
Subject: [PATCH] [AMDGPU] Add an option to disable unsafe uses of atomic xor
---
clang/include/clang/Basic/TargetInfo.h | 6 ++
clang/include/clang/Basic/TargetOptions.h | 3 +
clang/include/clang/Driver/Options.td | 10 +++
clang/lib/Basic/TargetInfo.cpp | 1 +
clang/lib/Basic/Targets/AMDGPU.cpp | 1 +
clang/lib/CodeGen/Targets/AMDGPU.cpp | 23 +++++++
clang/lib/Driver/ToolChains/Clang.cpp | 2 +
.../amdgpu-unsafe-int-atomics.cl | 62 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 26 ++++++++
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 24 +++++--
10 files changed, 154 insertions(+), 4 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 41f3c2e403cbe..867c4ebefa933 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -269,6 +269,9 @@ class TargetInfo : public TransferrableTargetInfo,
LLVM_PREFERRED_TYPE(bool)
unsigned ARMCDECoprocMask : 8;
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned AllowAMDGPUFineGrainedMem : 1;
+
unsigned MaxOpenCLWorkGroupSize;
std::optional<unsigned> MaxBitIntWidth;
@@ -1009,6 +1012,9 @@ class TargetInfo : public TransferrableTargetInfo,
/// allowed.
bool allowAMDGPUUnsafeFPAtomics() const { return AllowAMDGPUUnsafeFPAtomics; }
+ /// Returns whether or not fine-grained memory access is allowed on AMDGPU.
+ bool allowAMDGPUFineGrainedMem() const { return AllowAMDGPUFineGrainedMem; }
+
/// For ARM targets returns a mask defining which coprocessors are configured
/// as Custom Datapath.
uint32_t getARMCDECoprocMask() const { return ARMCDECoprocMask; }
diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h
index 2049f03b28893..90cfe6d71c6d0 100644
--- a/clang/include/clang/Basic/TargetOptions.h
+++ b/clang/include/clang/Basic/TargetOptions.h
@@ -78,6 +78,9 @@ class TargetOptions {
/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
bool AllowAMDGPUUnsafeFPAtomics = false;
+ /// \brief If enabled, allow fine-grained memory access on AMDGPU.
+ bool AllowAMDGPUFineGrainedMem = false;
+
/// \brief Code object version for AMDGPU.
llvm::CodeObjectVersionKind CodeObjectVersion =
llvm::CodeObjectVersionKind::COV_None;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1d04e4f6e7e6d..71cc5aeb5c4e6 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4749,6 +4749,16 @@ defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics",
def faltivec : Flag<["-"], "faltivec">, Group<f_Group>;
def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>;
+
+defm amdgpu_fine_grained_mem : BoolOption<"m", "amdgpu-fine-grained-mem",
+  TargetOpts<"AllowAMDGPUFineGrainedMem">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+          "Indicates that fine-grained memory allocations may be accessed in the "
+          "kernel. This may result in certain atomic operations being replaced "
+          "in order to guarantee correct operation when fine-grained memory "
+          "allocations are used. (AMDGPU only)">,
+  NegFlag<SetFalse, [], [ClangOption, CC1Option]>>, Group<m_Group>;
+
let Flags = [TargetSpecific] in {
def maltivec : Flag<["-"], "maltivec">, Group<m_ppc_Features_Group>,
HelpText<"Enable AltiVec vector initializer syntax">;
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 6cd5d618a4aca..8e623b59365e5 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -157,6 +157,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
HasAArch64SVETypes = false;
HasRISCVVTypes = false;
AllowAMDGPUUnsafeFPAtomics = false;
+ AllowAMDGPUFineGrainedMem = false;
ARMCDECoprocMask = 0;
// Default to no types using fpret.
diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp
index 409ae32ab4242..cf70d2e267d3e 100644
--- a/clang/lib/Basic/Targets/AMDGPU.cpp
+++ b/clang/lib/Basic/Targets/AMDGPU.cpp
@@ -232,6 +232,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple,
HasFloat16 = true;
WavefrontSize = GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32 ? 32 : 64;
AllowAMDGPUUnsafeFPAtomics = Opts.AllowAMDGPUUnsafeFPAtomics;
+ AllowAMDGPUFineGrainedMem = Opts.AllowAMDGPUFineGrainedMem;
// Set pointer width and alignment for the generic address space.
PointerWidth = PointerAlign = getPointerWidthV(LangAS::Default);
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index b654e3f12af8d..13bec674db8a4 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -392,6 +392,26 @@ void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
}
}
+// Add metadata to specific atomic instructions to mark them as potentially
+// accessing fine-grained memory locations.
+static void addFineGrainedAtomicMD(llvm::Function *F) {
+  llvm::LLVMContext &Ctx = F->getContext();
+  llvm::MDBuilder MDHelper(Ctx);
+  auto *Int32Ty = llvm::IntegerType::getInt32Ty(Ctx);
+  for (llvm::BasicBlock &BB : *F) {
+    for (llvm::Instruction &I : BB) {
+      llvm::AtomicRMWInst *ARI = llvm::dyn_cast<llvm::AtomicRMWInst>(&I);
+      if (!ARI || ARI->getOperation() != llvm::AtomicRMWInst::Xor)
+        continue;
+      auto *Key = MDHelper.createString("fine_grained");
+      auto *One = MDHelper.createConstant(llvm::ConstantInt::get(Int32Ty, 1));
+      llvm::MDNode *MD = llvm::MDNode::get(Ctx, {Key, One});
+      auto *Tuple = llvm::MDNode::get(Ctx, {MD});
+      ARI->setMetadata("amdgpu.atomic", Tuple);
+    }
+  }
+}
+
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
if (requiresAMDGPUProtectedVisibility(D, GV)) {
@@ -413,6 +433,9 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
+ if (M.getContext().getTargetInfo().allowAMDGPUFineGrainedMem())
+ addFineGrainedAtomicMD(F);
+
if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
F->addFnAttr("amdgpu-ieee", "false");
}
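For reference, with the option enabled the atomicrmw produced for an OpenCL
atomic_fetch_xor ends up tagged like this (a sketch assembled from the
FileCheck patterns in the new test below, not extra patch output):

  %res = atomicrmw xor ptr addrspace(1) %x, i32 %y syncscope("workgroup") seq_cst, align 4, !amdgpu.atomic !0
  ...
  !0 = !{!1}
  !1 = !{!"fine_grained", i32 1}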
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f02f7c841b91f..532c925520c85 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7346,6 +7346,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptInFlag(CmdArgs, options::OPT_munsafe_fp_atomics,
options::OPT_mno_unsafe_fp_atomics);
+ Args.addOptInFlag(CmdArgs, options::OPT_mamdgpu_fine_grained_mem,
+ options::OPT_mno_amdgpu_fine_grained_mem);
Args.addOptOutFlag(CmdArgs, options::OPT_mamdgpu_ieee,
options::OPT_mno_amdgpu_ieee);
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
new file mode 100644
index 0000000000000..0c81e0797d68b
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-unsafe-int-atomics.cl
@@ -0,0 +1,62 @@
+// REQUIRES: amdgpu-registered-target
+//
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-DEFAULT %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm -mamdgpu-fine-grained-mem \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -emit-llvm -mno-amdgpu-fine-grained-mem \
+// RUN: | FileCheck -check-prefixes=COMMON,UNSAFE-INT-OFF %s
+
+// Check AMDGCN ISA generation.
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-DEFAULT %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -mamdgpu-fine-grained-mem \
+// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-ON %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -O3 -S -o - %s \
+// RUN: -mno-amdgpu-fine-grained-mem \
+// RUN: | FileCheck -check-prefixes=COMMON-ISA,ISA-OFF %s
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
+} memory_scope;
+
+// COMMON-ISA: kern:
+// ISA-ON: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
+// ISA-OFF: flat_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc
+// ISA-DEFAULT: flat_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc
+kernel void kern(global atomic_int *x, int y, global int *z) {
+  *z = __opencl_atomic_fetch_xor(x, y, memory_order_seq_cst, memory_scope_work_group);
+}
+
+// COMMON: define{{.*}} amdgpu_kernel void @kern
+// COMMON: atomicrmw xor ptr addrspace(1) %x, i32 %y syncscope("workgroup") seq_cst, align 4
+
+// UNSAFE-INT-ON-SAME: !amdgpu.atomic ![[REF:[0-9]+]]
+// UNSAFE-INT-ON: ![[REF]] = !{![[REF2:[0-9]+]]}
+// UNSAFE-INT-ON: ![[REF2]] = !{!"fine_grained", i32 1}
+
+// UNSAFE-INT-OFF-NOT: !amdgpu.atomic
+
+// UNSAFE-INT-DEFAULT-NOT: !amdgpu.atomic
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 53ab5da013539..e71a874a59af8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15107,6 +15107,26 @@ bool unsafeFPAtomicsDisabled(Function *F) {
"true";
}
+// Inspect the instruction's metadata to determine whether or not it can access
+// fine-grained memory allocations. Some atomic instructions may fail on certain
+// systems when accessing fine-grained memory.
+static bool canAccessFineGrainedMem(AtomicRMWInst &RMW) {
+  if (MDNode *MD = RMW.getMetadata("amdgpu.atomic")) {
+    for (const MDOperand &Op : MD->operands()) {
+      MDNode *OpMD = dyn_cast<MDNode>(&*Op);
+      if (!OpMD || OpMD->getNumOperands() < 2)
+        continue;
+      const MDString *NameOp = dyn_cast<MDString>(OpMD->getOperand(0));
+      const auto *ValOp =
+          mdconst::dyn_extract<ConstantInt>(OpMD->getOperand(1));
+      if (NameOp && ValOp && NameOp->getString().equals("fine_grained") &&
+          ValOp->getZExtValue() != 0)
+        return true;
+    }
+  }
+  return false;
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
@@ -15229,6 +15249,12 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
break;
}
+  case AtomicRMWInst::Xor: {
+    if (AMDGPU::isFlatGlobalAddrSpace(AS) && canAccessFineGrainedMem(*RMW)) {
+      return AtomicExpansionKind::CmpXChg;
+    }
+    break;
+  }
default:
break;
}
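Returning AtomicExpansionKind::CmpXChg hands the xor to the generic
AtomicExpand pass, which rewrites it into a compare-exchange loop. A rough IR
sketch of that expansion (the standard loop shape, not text copied from this
patch):

define i32 @xor_expanded(ptr %addr, i32 %val) {
entry:
  %init = load i32, ptr %addr, align 4
  br label %loop
loop:
  ; Retry until no other agent wrote %addr between the load and the cmpxchg;
  ; the result is the value observed immediately before the successful xor.
  %old = phi i32 [ %init, %entry ], [ %seen, %loop ]
  %new = xor i32 %old, %val
  %pair = cmpxchg ptr %addr, i32 %old, i32 %new seq_cst seq_cst, align 4
  %seen = extractvalue { i32, i1 } %pair, 0
  %ok = extractvalue { i32, i1 } %pair, 1
  br i1 %ok, label %done, label %loop
done:
  ret i32 %old
}

This is the sequence that selects to the flat_atomic_cmpswap checked in the
tests below.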
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index 92b70ad797b88..75f2bd99175a7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX1100 %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1100 %s
define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX908-LABEL: syncscope_system:
@@ -391,4 +391,20 @@ define float @no_unsafe(ptr %addr, float %val) {
ret float %res
}
+define i32 @default_xor(ptr %addr, i32 %val) {
+  ; GCN-LABEL: default_xor:
+  ; GCN: flat_atomic_xor
+  %res = atomicrmw xor ptr %addr, i32 %val seq_cst
+  ret i32 %res
+}
+
+define i32 @no_unsafe_xor(ptr %addr, i32 %val) {
+  ; GCN-LABEL: no_unsafe_xor:
+  ; GCN: flat_atomic_cmpswap
+  %res = atomicrmw xor ptr %addr, i32 %val seq_cst, !amdgpu.atomic !0
+  ret i32 %res
+}
+
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
+!0 = !{!1}
+!1 = !{!"fine_grained", i32 1}