[llvm] [NVPTX] Support for memory orderings for cmpxchg (PR #126159)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 6 16:04:30 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Akshay Deodhar (akshayrdeodhar)
Changes:
Previously, all cmpxchg instructions were lowered to a plain atom.cas. This change adds support for memory orderings in the lowering. Specifically:
- For cmpxchg operations that are emulated, the memory ordering is enforced by adding fences around the emulation loops.
- For cmpxchg operations that are lowered directly to PTX and whose memory order is supported in PTX, lower to the corresponding PTX instruction.
- For seq_cst cmpxchg operations that are lowered directly to PTX, use the sequence (fence.sc; atom.cas.acquire) to provide the desired semantics (a sketch of the expected lowering is shown below).
Also adds tests for all combinations of (size, memory ordering, address space, SM/PTX version). Suggestions are welcome; we would ideally like to cover everything.
This also adds `atomicOperationOrderAfterFenceSplit` to TargetLowering, for special handling of seq_cst atomics.
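A minimal sketch of the seq_cst case described above, based on the updated CHECK lines in llvm/test/CodeGen/NVPTX/atomics.ll (the function and value names are illustrative, not taken from the patch):

```llvm
; With this change, a natively-supported seq_cst cmpxchg is expected to lower
; to a leading fence.sc followed by an acquire CAS (atom.acquire.cas.b32),
; with no trailing fence, rather than to a plain atom.cas.b32.
define i32 @cmpxchg_i32_seq_cst(ptr %addr, i32 %cmp, i32 %new) {
  %pair = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}
```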
---
Patch is 789.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126159.diff
13 Files Affected:
- (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+8)
- (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+4-2)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+68)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (+11)
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+87-52)
- (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+1-1)
- (modified) llvm/test/CodeGen/NVPTX/atomics.ll (+3-3)
- (added) llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll (+5680)
- (added) llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll (+5680)
- (added) llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll (+5680)
- (modified) llvm/test/CodeGen/NVPTX/cmpxchg.ll (+1549-80)
- (added) llvm/test/CodeGen/NVPTX/cmpxchg.py (+57)
- (modified) llvm/test/CodeGen/NVPTX/lit.local.cfg (+1-1)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a6ddaee7..ac3233ec30ee3e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2173,6 +2173,14 @@ class TargetLoweringBase {
return false;
}
+ // The memory ordering that AtomicExpandPass should assign to an atomic
+ // instruction that it has lowered by adding fences. This can be used
+ // to "fold" one of the fences into the atomic instruction.
+ virtual AtomicOrdering
+ atomicOperationOrderAfterFenceSplit(const Instruction *I) const {
+ return AtomicOrdering::Monotonic;
+ }
+
/// Whether AtomicExpandPass should automatically insert a trailing fence
/// without reducing the ordering for this atomic. Defaults to false.
virtual bool
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index a75fa688d87a8d..a3e9700fa3089c 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// failure path. As a result, fence insertion is directly done by
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
- CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
- CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+ auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+
+ CASI->setSuccessOrdering(CASOrdering);
+ CASI->setFailureOrdering(CASOrdering);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 58ad92a8934a66..db5f45fb7d09c8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -49,6 +50,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());
+ // PTX support for 16-bit CAS is emulated. Only use 32+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
@@ -5565,6 +5568,71 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
return AtomicExpansionKind::CmpXChg;
}
+bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
+ const Instruction *I) const {
+ auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
+ // When CAS bitwidth is not supported on the hardware, the CAS is emulated
+ // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
+ // the memory order using explicit fences around the retry loop.
+ // The memory order of natively supported CAS operations can be enforced
+ // by lowering to an atom.cas with the right memory synchronizing effect.
+ // However, atom.cas only supports relaxed, acquire, release and acq_rel.
+ // So we also use explicit fences for enforcing memory order for
+ // seq_cst CAS with natively-supported bitwidths.
+ return CI &&
+ (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
+ STI.getMinCmpXchgSizeInBits() ||
+ CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
+}
+
+AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
+ const Instruction *I) const {
+ auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
+ bool BitwidthSupportedAndIsSeqCst =
+ CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
+ STI.getMinCmpXchgSizeInBits();
+ return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
+ : AtomicOrdering::Monotonic;
+}
+
+Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ // Specialize for cmpxchg
+ if (isa<AtomicCmpXchgInst>(Inst)) {
+ // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
+ if (isReleaseOrStronger(Ord))
+ return Ord == AtomicOrdering::SequentiallyConsistent
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
+ : Builder.CreateFence(AtomicOrdering::Release);
+ } else {
+ return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
+ }
+ return nullptr;
+}
+
+Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ // Specialize for cmpxchg
+ if (isa<AtomicCmpXchgInst>(Inst)) {
+ auto CASWidth =
+ cast<IntegerType>(
+ dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
+ ->getBitWidth();
+ // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
+ if (isAcquireOrStronger(Ord))
+ return (Ord == AtomicOrdering::SequentiallyConsistent &&
+ CASWidth >= STI.getMinCmpXchgSizeInBits())
+ ? nullptr
+ : Builder.CreateFence(AtomicOrdering::Acquire);
+ } else {
+ return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
+ }
+ return nullptr;
+}
+
// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5adf69d621552f..1ee7a9b9ab8e36 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -17,6 +17,7 @@
#include "NVPTX.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/AtomicOrdering.h"
namespace llvm {
namespace NVPTXISD {
@@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering {
return true;
}
+ bool shouldInsertFencesForAtomic(const Instruction *) const override;
+
+ AtomicOrdering
+ atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
+
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 06c629c01d9ab0..14744411bc74bc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
// has 3 operands
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
- ValueType regT, NVPTXRegClass regclass,
+ ValueType regT, NVPTXRegClass regclass, string SemStr,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, list<Predicate> Pred> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
Requires<Pred>;
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
Requires<Pred>;
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
Requires<Pred>;
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
Requires<Pred>;
}
}
-multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
- defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred>;
- defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred>;
+multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
+ string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
+ defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+ OpcStr, IntOp, IMMType, Pred>;
+ defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+ OpcStr, IntOp, IMMType, Pred>;
}
// atom_add
@@ -2378,51 +2378,86 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
-// atom_cas
-
-def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-
-defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+multiclass ternary_atomic_op_as {
+ // one record per address space
+ def NAME#_gen: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.generic>;
+
+ def NAME#_g: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.global>;
+
+ def NAME#_s: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.shared>;
+}
+
+// generate pattern fragments for size x memory order
+// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will emulate all i8 cmpxchgs
+// using larger-bitwidth cas
+foreach size = ["i16", "i32", "i64"] in {
+ foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
+ defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
+ }
+}
+
+multiclass INT_PTX_ATOM_CAS_AS<string atomic_cmp_swap_without_as, string type, string order> {
+ // eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes
+ // F_ATOMIC_3<i32, Int32Regs, ".acquire", ".b32", ".cas", atomic_cmp_swap_i32_acquire, i32imm>
+
+
+ // For SM70, PTX63 +, memory orders are supported
+ defm _GEN: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, "", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _GEN_USE_G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _S: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ // For older archs, we fall back to lowering to relaxed atom.cas
+ defm _GEN_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", "", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+ defm _GEN_USE_G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+ defm _G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm")>;
+
+ defm _S_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm")>;
+}
+
+// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware.
+foreach size = ["32", "64"] in {
+ // We enumerate the instructions with specific memory orders ["acquire", "release", "acq_rel"]
+ // *before* the instructions with the unspecified memory order [""].
+ // This ensures that `cmpxchg ... acquire` is lowered to "atom.acquire.cas" instead of "atom.cas"
+ // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
+ foreach order = ["acquire", "release", "acq_rel", ""] in {
+ if !eq(order, "") then {
+ defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">;
+ } else {
+ defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>;
+ }
+ }
+}
+
+// Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?)
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
- atomic_cmp_swap_i32_g, i32imm>;
-defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
- atomic_cmp_swap_i32_s, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
- atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
- ".cas", atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
- atomic_cmp_swap_i64_g, i64imm>;
-defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
- atomic_cmp_swap_i64_s, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
- atomic_cmp_swap_i64_gen, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
- ".cas", atomic_cmp_swap_i64_gen, i64imm>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 851c9152e4cb8f..f893f1aefec848 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
- // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
+ // Get the smallest cmpxchg word size that the hardware supports.
unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index fccc3a8844ffc4..ac2bd8828b2057 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
; CHECK-LABEL: atomicrmw_add_f16_generic
define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: atom.cas
+; CHECK: atom.acquire.cas
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
ret half %ret
}
@@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
; CHECK-LABEL: atomic_cmpxchg_i32
define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: atom.cas.b32
+; CHECK: atom.acquire.cas.b32
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
ret i32 %new
}
; CHECK-LABEL: atomic_cmpxchg_i64
define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: atom.cas.b64
+; CHECK: atom.acquire.cas.b64
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
ret i64 %new
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
new file mode 100644
index 00000000000000..ea308c2a7673b7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 ...
[truncated]
``````````
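For the emulated-width case described in `shouldInsertFencesForAtomic` above, a minimal sketch of the input IR (names are illustrative; the exact PTX emitted for the fences depends on the SM/PTX version):

```llvm
; An i8 cmpxchg is below getMinCmpXchgSizeInBits() (32), so AtomicExpandPass is
; expected to surround the expansion with explicit fences (a release leading
; fence and an acquire trailing fence) and emulate the operation with a retry
; loop around a monotonic 32-bit atom.cas.
define i8 @cmpxchg_i8_acq_rel(ptr %addr, i8 %cmp, i8 %new) {
  %pair = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
  %old = extractvalue { i8, i1 } %pair, 0
  ret i8 %old
}
```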
https://github.com/llvm/llvm-project/pull/126159