[llvm] [LSV][AMDGPU] Vectorize unordered and monotonic atomic loads (PR #190152)
Harrison Hao via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 04:58:43 PDT 2026
https://github.com/harrisonGPU updated https://github.com/llvm/llvm-project/pull/190152
>From 50d15efc7adf45446a227db9d1cab74268aeef78 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 2 Apr 2026 18:38:33 +0800
Subject: [PATCH 1/2] [LSV][AMDGPU] Vectorize unordered and monotonic atomic
loads
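Allow the LoadStoreVectorizer to merge adjacent atomic loads whose
ordering is unordered or monotonic. These orderings impose no
cross-address synchronization constraints, so combining adjacent
accesses into a wider load is safe when the target guarantees
atomicity for the wider access. The equivalence-class key gains the
atomic ordering and syncscope so that only loads with identical
ordering and scope land in the same chain, and the merged load
preserves both.

For example (adapted from the new atomic-load-merge.ll test below; the
%v name is illustrative), two adjacent monotonic loads

  %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
  %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4

become a single vector load:

  %v = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4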
---
.../llvm/Analysis/TargetTransformInfoImpl.h | 4 +-
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 11 +
.../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 1 +
.../Vectorize/LoadStoreVectorizer.cpp | 67 +++-
llvm/test/CodeGen/AMDGPU/atomic-load-merge.ll | 351 ++++++++++++++++++
5 files changed, 415 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/atomic-load-merge.ll
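Note for reviewers: the new test covers both the merge cases
(monotonic and unordered loads at agent and workgroup scope, merged
into <2 x float> and <4 x float>) and the no-merge cases (volatile,
mixed atomic/non-atomic, non-adjacent offsets, acquire, seq_cst,
repeated same-offset loads, mixed orderings, and mixed syncscopes).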
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 419f2eedd6d8f..333b95e26dd50 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1112,7 +1112,9 @@ class TargetTransformInfoImplBase {
return 128;
}
- virtual bool isLegalToVectorizeLoad(LoadInst *LI) const { return true; }
+ virtual bool isLegalToVectorizeLoad(LoadInst *LI) const {
+ return LI->isSimple();
+ }
virtual bool isLegalToVectorizeStore(StoreInst *SI) const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d02bc45bc14f6..298e5f95a7b05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -398,6 +398,17 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
return 128;
}
+bool GCNTTIImpl::isLegalToVectorizeLoad(LoadInst *LI) const {
+ // Simple (non-atomic, non-volatile) loads are always legal.
+ if (LI->isSimple())
+ return true;
+ // Allow unordered and monotonic atomic loads to be vectorized. These
+ // orderings impose no cross-address synchronization constraints, so merging
+ // adjacent accesses into a wider load is safe. The hardware guarantees
+ // atomicity for naturally aligned loads up to 128 bits.
+ return !LI->isVolatile() && !isStrongerThanMonotonic(LI->getOrdering());
+}
+
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
Align Alignment,
unsigned AddrSpace) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ea2bf72836199..edadc11e84056 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -139,6 +139,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
VectorType *VecTy) const override;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;
+ bool isLegalToVectorizeLoad(LoadInst *LI) const override;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 89ba9227d8952..e7816439a6e85 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -130,17 +130,24 @@ namespace {
// (We could in theory remove element-size from this tuple. We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
- std::tuple<const Value * /* result of getUnderlyingObject() */,
- unsigned /* AddrSpace */,
- unsigned /* Load/Store element size bits */,
- char /* IsLoad; char b/c bool can't be a DenseMap key */
+ std::tuple<const Value *, // result of getUnderlyingObject()
+ unsigned, // AddrSpace
+ unsigned, // Load/Store element size bits
+ char, // IsLoad; char b/c bool can't be a DenseMap key
+ unsigned, // AtomicOrdering
+ unsigned // SyncScopeID
>;
+
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
const EqClassKey &K) {
- const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
- return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
- << " of element size " << ElementSize << " bits in addrspace "
- << AddrSpace;
+ const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad, Ordering,
+ SSID] = K;
+ OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
+ << " of element size " << ElementSize << " bits in addrspace "
+ << AddrSpace;
+ if (Ordering != static_cast<unsigned>(AtomicOrdering::NotAtomic))
+ OS << " atomic " << toIRString(static_cast<AtomicOrdering>(Ordering));
+ return OS;
}
// A Chain is a set of instructions such that:
@@ -1159,6 +1166,12 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// i.e. the root of the vector.
VecInst = Builder.CreateAlignedLoad(
VecTy, getLoadStorePointerOperand(C[0].Inst), Alignment);
+ // Preserve atomic ordering and syncscope on the merged load.
+ auto *OrigLI = cast<LoadInst>(C[0].Inst);
+ if (OrigLI->isAtomic()) {
+ cast<LoadInst>(VecInst)->setOrdering(OrigLI->getOrdering());
+ cast<LoadInst>(VecInst)->setSyncScopeID(OrigLI->getSyncScopeID());
+ }
}
for (const ChainElem &E : C) {
@@ -1581,12 +1594,14 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
// The reduced key has all elements of the ECClassKey except the underlying
- // object. Check that EqClassKey has 4 elements and define the reduced key.
+ // object. Check that EqClassKey has 6 elements and define the reduced key.
- static_assert(std::tuple_size_v<EqClassKey> == 4,
+ static_assert(std::tuple_size_v<EqClassKey> == 6,
"EqClassKey has changed - EqClassReducedKey needs changes too");
using EqClassReducedKey =
std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
std::tuple_element_t<2, EqClassKey> /* Element size */,
- std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
+ std::tuple_element_t<3, EqClassKey> /* IsLoad */,
+ std::tuple_element_t<4, EqClassKey> /* AtomicOrdering */,
+ std::tuple_element_t<5, EqClassKey> /* SyncScopeID */>;
using ECReducedKeyToUnderlyingObjectMap =
MapVector<EqClassReducedKey,
SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
@@ -1599,7 +1614,8 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
for (const auto &EC : EQClasses) {
const auto &Key = EC.first;
EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
- std::get<3>(Key)};
+ std::get<3>(Key), std::get<4>(Key),
+ std::get<5>(Key)};
auto &UOMap = RedKeyToUOMap[RedKey];
UOMap.insert(std::get<0>(Key));
if (UOMap.size() > 1)
@@ -1621,7 +1637,9 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
for (const auto &RedKeyToUO : RedKeyToUOMap) {
dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
<< std::get<1>(RedKeyToUO.first) << ", "
- << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
+ << static_cast<int>(std::get<2>(RedKeyToUO.first)) << ", "
+ << std::get<3>(RedKeyToUO.first) << ", "
+ << std::get<4>(RedKeyToUO.first) << "} --> "
<< RedKeyToUO.second.size() << " underlying objects:\n";
for (auto UObject : RedKeyToUO.second)
dbgs() << " " << *UObject << '\n';
@@ -1661,10 +1679,15 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
if (UObject == UltimateTarget)
continue;
- EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
- std::get<2>(RedKey)};
- EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
- std::get<2>(RedKey)};
+ EqClassKey KeyFrom{UObject,
+ std::get<0>(RedKey),
+ std::get<1>(RedKey),
+ std::get<2>(RedKey),
+ std::get<3>(RedKey),
+ std::get<4>(RedKey)};
+ EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey),
+ std::get<1>(RedKey), std::get<2>(RedKey),
+ std::get<3>(RedKey), std::get<4>(RedKey)};
// The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
// request the reference to the instructions vector for KeyTo first.
const auto &VecTo = EQClasses[KeyTo];
@@ -1714,7 +1737,13 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
if (!LI && !SI)
continue;
- if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
+ if ((LI && LI->isVolatile()) || (SI && !SI->isSimple()))
+ continue;
+
+ // Skip atomic accesses with ordering stronger than monotonic. Unordered
+ // and monotonic atomic loads can be safely vectorized as they impose no
+ // cross-address ordering constraints.
+ if (LI && isStrongerThanMonotonic(LI->getOrdering()))
continue;
if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
@@ -1755,9 +1784,11 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
+ unsigned Ordering = LI ? static_cast<unsigned>(LI->getOrdering()) : 0;
+ unsigned SSID = LI ? static_cast<unsigned>(LI->getSyncScopeID()) : 0;
Ret[{GetUnderlyingObject(Ptr), AS,
DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
- /*IsLoad=*/LI != nullptr}]
+ /*IsLoad=*/LI != nullptr, Ordering, SSID}]
.emplace_back(&I);
}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic-load-merge.ll b/llvm/test/CodeGen/AMDGPU/atomic-load-merge.ll
new file mode 100644
index 0000000000000..827f8ce26646f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomic-load-merge.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+define amdgpu_cs void @atomic_two_load_monotonic_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_monotonic_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_monotonic_merge2(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_monotonic_merge2(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[P1]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %a0 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p2 syncscope("agent") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_unordered_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_unordered_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[P]] syncscope("agent") unordered, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") unordered, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") unordered, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_unordered_workgroup_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_unordered_workgroup_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[P]] syncscope("workgroup") unordered, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("workgroup") unordered, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("workgroup") unordered, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_monotonic_workgroup_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_monotonic_workgroup_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[P]] syncscope("workgroup") monotonic, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("workgroup") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_volatile_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_volatile_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A0:%.*]] = load atomic volatile float, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A1:%.*]] = load atomic volatile float, ptr addrspace(1) [[P1]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A0]], [[A1]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic volatile float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic volatile float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_mixed_atomic_noatomic_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_mixed_atomic_noatomic_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A01:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A12:%.*]] = load float, ptr addrspace(1) [[P1]], align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load float, ptr addrspace(1) %p1, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_gap_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_gap_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 8
+; CHECK-NEXT: [[A0:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A1:%.*]] = load atomic float, ptr addrspace(1) [[P1]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A0]], [[A1]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_acquire_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_acquire_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A0:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") acquire, align 4
+; CHECK-NEXT: [[A1:%.*]] = load atomic float, ptr addrspace(1) [[P1]] syncscope("agent") acquire, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A0]], [[A1]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") acquire, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") acquire, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_seq_cst_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_seq_cst_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A0:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[A1:%.*]] = load atomic float, ptr addrspace(1) [[P1]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A0]], [[A1]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") seq_cst, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_same_offset_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_same_offset_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[NUM1:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[NUM2:%.*]] = fadd float [[TMP1]], 2.000000e+00
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[NUM1]], [[NUM2]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %num1 = fadd float %a0, 1.0
+ %num2 = fadd float %a1, 2.0
+ %res = fadd float %num1, %num2
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_mixed_order_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_mixed_order_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A01:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") unordered, align 4
+; CHECK-NEXT: [[A12:%.*]] = load atomic float, ptr addrspace(1) [[P1]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") unordered, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_two_load_mixed_scope_no_merge(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_two_load_mixed_scope_no_merge(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 4
+; CHECK-NEXT: [[A01:%.*]] = load atomic float, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A12:%.*]] = load atomic float, ptr addrspace(1) [[P1]] syncscope("workgroup") monotonic, align 4
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: store float [[RES]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("workgroup") monotonic, align 4
+ %res = fadd float %a0, %a1
+ store float %res, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_four_load_monotonic_merge_b128(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_four_load_monotonic_merge_b128(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <4 x float>, ptr addrspace(1) [[P]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[A34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[S01:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: [[S23:%.*]] = fadd float [[A23]], [[A34]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd float [[S01]], [[S23]]
+; CHECK-NEXT: store float [[SUM]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %p3 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 12
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") monotonic, align 4
+ %a2 = load atomic float, ptr addrspace(1) %p2 syncscope("agent") monotonic, align 4
+ %a3 = load atomic float, ptr addrspace(1) %p3 syncscope("agent") monotonic, align 4
+ %s01 = fadd float %a0, %a1
+ %s23 = fadd float %a2, %a3
+ %sum = fadd float %s01, %s23
+ store float %sum, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_four_load_monotonic_workgroup_merge_b128(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_four_load_monotonic_workgroup_merge_b128(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <4 x float>, ptr addrspace(1) [[P]] syncscope("workgroup") monotonic, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[A34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[S01:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: [[S23:%.*]] = fadd float [[A23]], [[A34]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd float [[S01]], [[S23]]
+; CHECK-NEXT: store float [[SUM]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %p3 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 12
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("workgroup") monotonic, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("workgroup") monotonic, align 4
+ %a2 = load atomic float, ptr addrspace(1) %p2 syncscope("workgroup") monotonic, align 4
+ %a3 = load atomic float, ptr addrspace(1) %p3 syncscope("workgroup") monotonic, align 4
+ %s01 = fadd float %a0, %a1
+ %s23 = fadd float %a2, %a3
+ %sum = fadd float %s01, %s23
+ store float %sum, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_four_load_unordered_merge_b128(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_four_load_unordered_merge_b128(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <4 x float>, ptr addrspace(1) [[P]] syncscope("agent") unordered, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[A34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[S01:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: [[S23:%.*]] = fadd float [[A23]], [[A34]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd float [[S01]], [[S23]]
+; CHECK-NEXT: store float [[SUM]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %p3 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 12
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("agent") unordered, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("agent") unordered, align 4
+ %a2 = load atomic float, ptr addrspace(1) %p2 syncscope("agent") unordered, align 4
+ %a3 = load atomic float, ptr addrspace(1) %p3 syncscope("agent") unordered, align 4
+ %s01 = fadd float %a0, %a1
+ %s23 = fadd float %a2, %a3
+ %sum = fadd float %s01, %s23
+ store float %sum, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_cs void @atomic_four_load_unordered_workgroup_merge_b128(ptr addrspace(1) align 16 %p, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_cs void @atomic_four_load_unordered_workgroup_merge_b128(
+; CHECK-SAME: ptr addrspace(1) align 16 [[P:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load atomic <4 x float>, ptr addrspace(1) [[P]] syncscope("workgroup") unordered, align 4
+; CHECK-NEXT: [[A01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[A12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[A34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[S01:%.*]] = fadd float [[A01]], [[A12]]
+; CHECK-NEXT: [[S23:%.*]] = fadd float [[A23]], [[A34]]
+; CHECK-NEXT: [[SUM:%.*]] = fadd float [[S01]], [[S23]]
+; CHECK-NEXT: store float [[SUM]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %p1 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 4
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 8
+ %p3 = getelementptr inbounds i8, ptr addrspace(1) %p, i64 12
+ %a0 = load atomic float, ptr addrspace(1) %p syncscope("workgroup") unordered, align 4
+ %a1 = load atomic float, ptr addrspace(1) %p1 syncscope("workgroup") unordered, align 4
+ %a2 = load atomic float, ptr addrspace(1) %p2 syncscope("workgroup") unordered, align 4
+ %a3 = load atomic float, ptr addrspace(1) %p3 syncscope("workgroup") unordered, align 4
+ %s01 = fadd float %a0, %a1
+ %s23 = fadd float %a2, %a3
+ %sum = fadd float %s01, %s23
+ store float %sum, ptr addrspace(1) %out, align 4
+ ret void
+}
>From 4a964023eac94feeb6e01644bf14ec4c08a78148 Mon Sep 17 00:00:00 2001
From: Harrison Hao <tsworld1314 at gmail.com>
Date: Thu, 2 Apr 2026 19:56:26 +0800
Subject: [PATCH 2/2] Use AtomicOrdering enum in EqClassKey
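Store the ordering in EqClassKey as AtomicOrdering rather than a raw
unsigned. This avoids casts when building keys; only the debug dump
still casts, to print the enum as an integer.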
---
.../Vectorize/LoadStoreVectorizer.cpp | 21 ++++++++++---------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index e7816439a6e85..f3e2a215d7d2a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -130,12 +130,12 @@ namespace {
// (We could in theory remove element-size from this tuple. We'd just need
// to fix up the vector packing/unpacking code.)
using EqClassKey =
- std::tuple<const Value *, // result of getUnderlyingObject()
- unsigned, // AddrSpace
- unsigned, // Load/Store element size bits
- char, // IsLoad; char b/c bool can't be a DenseMap key
- unsigned, // AtomicOrdering
- unsigned // SyncScopeID
+ std::tuple<const Value *, // result of getUnderlyingObject()
+ unsigned, // AddrSpace
+ unsigned, // Load/Store element size bits
+ char, // IsLoad; char b/c bool can't be a DenseMap key
+ AtomicOrdering, // AtomicOrdering
+ unsigned // SyncScopeID
>;
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
@@ -145,8 +145,8 @@ using EqClassKey =
OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
<< " of element size " << ElementSize << " bits in addrspace "
<< AddrSpace;
- if (Ordering != static_cast<unsigned>(AtomicOrdering::NotAtomic))
- OS << " atomic " << toIRString(static_cast<AtomicOrdering>(Ordering));
+ if (Ordering != AtomicOrdering::NotAtomic)
+ OS << " atomic " << toIRString(Ordering);
return OS;
}
@@ -1638,7 +1638,7 @@ void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
<< std::get<1>(RedKeyToUO.first) << ", "
<< static_cast<int>(std::get<2>(RedKeyToUO.first)) << ", "
- << std::get<3>(RedKeyToUO.first) << ", "
+ << static_cast<unsigned>(std::get<3>(RedKeyToUO.first)) << ", "
<< std::get<4>(RedKeyToUO.first) << "} --> "
<< RedKeyToUO.second.size() << " underlying objects:\n";
for (auto UObject : RedKeyToUO.second)
@@ -1784,7 +1784,8 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- unsigned Ordering = LI ? static_cast<unsigned>(LI->getOrdering()) : 0;
+ AtomicOrdering Ordering =
+ LI ? LI->getOrdering() : AtomicOrdering::NotAtomic;
unsigned SSID = LI ? static_cast<unsigned>(LI->getSyncScopeID()) : 0;
Ret[{GetUnderlyingObject(Ptr), AS,
DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),