[clang] 595b961 - [CUDA] Use monotonic ordering for __nvvm_atom* builtins (#185822)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 12 09:48:14 PDT 2026
Author: Artem Belevich
Date: 2026-03-12T09:48:09-07:00
New Revision: 595b96140013cff96b0b92b148857e3e72fa2786
URL: https://github.com/llvm/llvm-project/commit/595b96140013cff96b0b92b148857e3e72fa2786
DIFF: https://github.com/llvm/llvm-project/commit/595b96140013cff96b0b92b148857e3e72fa2786.diff
LOG: [CUDA] Use monotonic ordering for __nvvm_atom* builtins (#185822)
CUDA's __nvvm_atom* builtins are expected to produce atomic operations
with relaxed ordering. However, Clang lowered them as atomicrmw and cmpxchg
with the default seq_cst ordering. That mismatch went unnoticed because
until recently the NVPTX back end was unable to lower all atomic instructions correctly,
and despite using `seq_cst` ordering in IR we ended up generating the intended
PTX instructions with relaxed ordering. It worked well enough until
https://github.com/llvm/llvm-project/pull/179553 implemented correct NVPTX
atomic lowering.
That, in turn, caused severe performance regression for the code that
relied on these builtins.
Thanks to @akshayrdeodhar for figuring out what happened.
Switching __nvvm_atom* builtins to generate atomic instructions with
monotonic ordering matches the expected semantics of the builtins,
and restores performance of the generated code.
See:
https://github.com/llvm/llvm-project/pull/179553#issuecomment-4035193968
Added:
Modified:
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/CodeGen/CGBuiltin.h
clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
clang/test/CodeGen/builtins-nvptx-ptx50.cu
clang/test/CodeGen/builtins-nvptx.c
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6fb43d5cb0fbf..df03e84ce9f81 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -390,7 +390,9 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
/// Note: In order to lower Microsoft's _InterlockedCompareExchange* intrinsics
/// invoke the function EmitAtomicCmpXchgForMSIntrin.
Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
- bool ReturnBool) {
+ bool ReturnBool,
+ llvm::AtomicOrdering SuccessOrdering,
+ llvm::AtomicOrdering FailureOrdering) {
QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
Address DestAddr = CheckAtomicAlignment(CGF, E);
@@ -403,8 +405,7 @@ Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
Value *New = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
- DestAddr, Cmp, New, llvm::AtomicOrdering::SequentiallyConsistent,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ DestAddr, Cmp, New, SuccessOrdering, FailureOrdering);
if (ReturnBool)
// Extract boolean success flag and zext it to int.
return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
@@ -5079,14 +5080,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
case Builtin::BI__sync_val_compare_and_swap_4:
case Builtin::BI__sync_val_compare_and_swap_8:
case Builtin::BI__sync_val_compare_and_swap_16:
- return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
+ return RValue::get(MakeAtomicCmpXchgValue(
+ *this, E, false, AtomicOrdering::SequentiallyConsistent,
+ AtomicOrdering::SequentiallyConsistent));
case Builtin::BI__sync_bool_compare_and_swap_1:
case Builtin::BI__sync_bool_compare_and_swap_2:
case Builtin::BI__sync_bool_compare_and_swap_4:
case Builtin::BI__sync_bool_compare_and_swap_8:
case Builtin::BI__sync_bool_compare_and_swap_16:
- return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
+ return RValue::get(MakeAtomicCmpXchgValue(
+ *this, E, true, AtomicOrdering::SequentiallyConsistent,
+ AtomicOrdering::SequentiallyConsistent));
case Builtin::BI__sync_swap_1:
case Builtin::BI__sync_swap_2:
diff --git a/clang/lib/CodeGen/CGBuiltin.h b/clang/lib/CodeGen/CGBuiltin.h
index 667bce845f5c0..df71e46629884 100644
--- a/clang/lib/CodeGen/CGBuiltin.h
+++ b/clang/lib/CodeGen/CGBuiltin.h
@@ -99,7 +99,8 @@ llvm::Value *EmitOverflowIntrinsic(clang::CodeGen::CodeGenFunction &CGF,
llvm::Value *&Carry);
llvm::Value *MakeAtomicCmpXchgValue(clang::CodeGen::CodeGenFunction &CGF,
- const clang::CallExpr *E,
- bool ReturnBool);
+ const clang::CallExpr *E, bool ReturnBool,
+ llvm::AtomicOrdering SuccessOrdering,
+ llvm::AtomicOrdering FailureOrdering);
#endif
diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index b4f7342e23473..423a7a3097119 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -431,52 +431,62 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_add_gen_i:
case NVPTX::BI__nvvm_atom_add_gen_l:
case NVPTX::BI__nvvm_atom_add_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_sub_gen_i:
case NVPTX::BI__nvvm_atom_sub_gen_l:
case NVPTX::BI__nvvm_atom_sub_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_and_gen_i:
case NVPTX::BI__nvvm_atom_and_gen_l:
case NVPTX::BI__nvvm_atom_and_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_or_gen_i:
case NVPTX::BI__nvvm_atom_or_gen_l:
case NVPTX::BI__nvvm_atom_or_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_xor_gen_i:
case NVPTX::BI__nvvm_atom_xor_gen_l:
case NVPTX::BI__nvvm_atom_xor_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_xchg_gen_i:
case NVPTX::BI__nvvm_atom_xchg_gen_l:
case NVPTX::BI__nvvm_atom_xchg_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_max_gen_i:
case NVPTX::BI__nvvm_atom_max_gen_l:
case NVPTX::BI__nvvm_atom_max_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_max_gen_ui:
case NVPTX::BI__nvvm_atom_max_gen_ul:
case NVPTX::BI__nvvm_atom_max_gen_ull:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_min_gen_i:
case NVPTX::BI__nvvm_atom_min_gen_l:
case NVPTX::BI__nvvm_atom_min_gen_ll:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_min_gen_ui:
case NVPTX::BI__nvvm_atom_min_gen_ul:
case NVPTX::BI__nvvm_atom_min_gen_ull:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_cas_gen_us:
case NVPTX::BI__nvvm_atom_cas_gen_i:
@@ -484,7 +494,9 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_atom_cas_gen_ll:
// __nvvm_atom_cas_gen_* should return the old value rather than the
// success flag.
- return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
+ return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false,
+ AtomicOrdering::Monotonic,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_add_gen_f:
case NVPTX::BI__nvvm_atom_add_gen_d: {
@@ -492,14 +504,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
Value *Val = EmitScalarExpr(E->getArg(1));
return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
- AtomicOrdering::SequentiallyConsistent);
+ AtomicOrdering::Monotonic);
}
case NVPTX::BI__nvvm_atom_inc_gen_ui:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_atom_dec_gen_ui:
- return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E,
+ AtomicOrdering::Monotonic);
case NVPTX::BI__nvvm_ldg_c:
case NVPTX::BI__nvvm_ldg_sc:
diff --git a/clang/test/CodeGen/builtins-nvptx-ptx50.cu b/clang/test/CodeGen/builtins-nvptx-ptx50.cu
index a2d527537aed0..2a141baf3a6d0 100644
--- a/clang/test/CodeGen/builtins-nvptx-ptx50.cu
+++ b/clang/test/CodeGen/builtins-nvptx-ptx50.cu
@@ -17,7 +17,7 @@
// CHECK-LABEL: test_fn
__device__ void test_fn(double d, double* double_ptr) {
- // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw fadd ptr {{.*}} monotonic, align 8
// expected-error at +1 {{'__nvvm_atom_add_gen_d' needs target feature sm_60}}
__nvvm_atom_add_gen_d(double_ptr, d);
}
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 2e1acc0aac259..f1b41ba557426 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -309,91 +309,91 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
unsigned short *usp, unsigned short us, int *ip,
int i, unsigned int *uip, unsigned ui, long *lp,
long l, long long *llp, long long ll) {
- // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw add ptr {{.*}} monotonic, align 4
__nvvm_atom_add_gen_i(ip, i);
- // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw add ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_add_gen_l(&dl, l);
- // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw add ptr {{.*}} monotonic, align 8
__nvvm_atom_add_gen_ll(&sll, ll);
- // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw sub ptr {{.*}} monotonic, align 4
__nvvm_atom_sub_gen_i(ip, i);
- // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw sub ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_sub_gen_l(&dl, l);
- // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw sub ptr {{.*}} monotonic, align 8
__nvvm_atom_sub_gen_ll(&sll, ll);
- // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw and ptr {{.*}} monotonic, align 4
__nvvm_atom_and_gen_i(ip, i);
- // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw and ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_and_gen_l(&dl, l);
- // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw and ptr {{.*}} monotonic, align 8
__nvvm_atom_and_gen_ll(&sll, ll);
- // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw or ptr {{.*}} monotonic, align 4
__nvvm_atom_or_gen_i(ip, i);
- // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw or ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_or_gen_l(&dl, l);
- // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw or ptr {{.*}} monotonic, align 8
__nvvm_atom_or_gen_ll(&sll, ll);
- // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw xor ptr {{.*}} monotonic, align 4
__nvvm_atom_xor_gen_i(ip, i);
- // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw xor ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_xor_gen_l(&dl, l);
- // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw xor ptr {{.*}} monotonic, align 8
__nvvm_atom_xor_gen_ll(&sll, ll);
- // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align 4
__nvvm_atom_xchg_gen_i(ip, i);
- // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_xchg_gen_l(&dl, l);
- // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align 8
__nvvm_atom_xchg_gen_ll(&sll, ll);
- // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw max ptr {{.*}} monotonic, align 4
__nvvm_atom_max_gen_i(ip, i);
- // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw umax ptr {{.*}} monotonic, align 4
__nvvm_atom_max_gen_ui((unsigned int *)ip, i);
- // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw max ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_max_gen_l(&dl, l);
- // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw umax ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
- // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw max ptr {{.*}} monotonic, align 8
__nvvm_atom_max_gen_ll(&sll, ll);
- // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw umax ptr {{.*}} monotonic, align 8
__nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);
- // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw min ptr {{.*}} monotonic, align 4
__nvvm_atom_min_gen_i(ip, i);
- // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw umin ptr {{.*}} monotonic, align 4
__nvvm_atom_min_gen_ui((unsigned int *)ip, i);
- // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw min ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_min_gen_l(&dl, l);
- // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
+ // CHECK: atomicrmw umin ptr {{.*}} monotonic, align {{4|8}}
__nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
- // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw min ptr {{.*}} monotonic, align 8
__nvvm_atom_min_gen_ll(&sll, ll);
- // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
+ // CHECK: atomicrmw umin ptr {{.*}} monotonic, align 8
__nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
- // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
+ // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 4
// CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_i(ip, 0, i);
- // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
+ // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align {{4|8}}
// CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_l(&dl, 0, l);
- // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
+ // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 8
// CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_ll(&sll, 0, ll);
- // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw fadd ptr {{.*}} monotonic, align 4
__nvvm_atom_add_gen_f(fp, f);
- // CHECK: atomicrmw uinc_wrap ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw uinc_wrap ptr {{.*}} monotonic, align 4
__nvvm_atom_inc_gen_ui(uip, ui);
- // CHECK: atomicrmw udec_wrap ptr {{.*}} seq_cst, align 4
+ // CHECK: atomicrmw udec_wrap ptr {{.*}} monotonic, align 4
__nvvm_atom_dec_gen_ui(uip, ui);
@@ -646,7 +646,7 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
#endif
#if __CUDA_ARCH__ >= 700
- // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
+ // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} monotonic monotonic, align 2
// CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
__nvvm_atom_cas_gen_us(usp, 0, us);
// CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
More information about the cfe-commits
mailing list