[llvm] [LoadStoreVectorizer] Allow redundant stores (PR #169946)
Gang Chen via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 28 11:28:32 PST 2025
https://github.com/cmc-rep updated https://github.com/llvm/llvm-project/pull/169946
From 75954833f636a61c841d37736bc5620c81225a88 Mon Sep 17 00:00:00 2001
From: Gang Chen <Gang.Chen at amd.com>
Date: Fri, 28 Nov 2025 09:49:11 -0800
Subject: [PATCH] [LoadStoreVectorizer] Allow redundant stores
---
.../Vectorize/LoadStoreVectorizer.cpp | 68 +++--------
.../AMDGPU/multiple_tails.ll | 4 -
.../AMDGPU/vectorize-redund-stores.ll | 108 ++++++++++++++++++
3 files changed, 124 insertions(+), 56 deletions(-)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c28314f6ab124..d48c00688e878 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -316,14 +316,12 @@ class Vectorizer {
/// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
/// instructions.
///
- /// The map ChainElemOffsets must contain all of the elements in
- /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
- /// address. It's ok if it contains additional entries.
+ /// The map ChainSet must contain all of the elements in
+ /// [ChainBegin, ChainElem]. It's ok if it contains additional entries.
template <bool IsLoadChain>
- bool isSafeToMove(
- Instruction *ChainElem, Instruction *ChainBegin,
- const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
- BatchAAResults &BatchAA);
+ bool isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseSet<Instruction *> &ChainSet,
+ BatchAAResults &BatchAA);
/// Merges the equivalence classes if they have underlying objects that differ
/// by one level of indirection (i.e., one is a getelementptr and the other is
@@ -540,9 +538,9 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
// We know that elements in the chain with nonverlapping offsets can't
// alias, but AA may not be smart enough to figure this out. Use a
// hashtable.
- DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
+ DenseSet<Instruction *> ChainSet;
for (const auto &E : C)
- ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
+ ChainSet.insert(E.Inst);
// Across a single invocation of this function the IR is not changing, so
// using a batched Alias Analysis is safe and can reduce compile time.
@@ -573,8 +571,8 @@ std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
SmallVector<ChainElem, 1> NewChain;
NewChain.emplace_back(*ChainBegin);
for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
- if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
- ChainOffsets, BatchAA)) {
+ if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst, ChainSet,
+ BatchAA)) {
LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
<< *ChainIt->Inst << " into " << *ChainBegin->Inst
<< "\n");
@@ -1037,10 +1035,9 @@ bool Vectorizer::vectorizeChain(Chain &C) {
}
template <bool IsLoadChain>
-bool Vectorizer::isSafeToMove(
- Instruction *ChainElem, Instruction *ChainBegin,
- const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
- BatchAAResults &BatchAA) {
+bool Vectorizer::isSafeToMove(Instruction *ChainElem, Instruction *ChainBegin,
+ const DenseSet<Instruction *> &ChainSet,
+ BatchAAResults &BatchAA) {
LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
<< *ChainBegin << ")\n");
@@ -1066,10 +1063,6 @@ bool Vectorizer::isSafeToMove(
return BasicBlock::iterator(ChainBegin);
}());
- const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
- const unsigned ChainElemSize =
- DL.getTypeStoreSize(getLoadStoreType(ChainElem));
-
for (; BBIt != BBItEnd; ++BBIt) {
Instruction *I = &*BBIt;
@@ -1084,39 +1077,10 @@ bool Vectorizer::isSafeToMove(
if (!IsLoadChain && isInvariantLoad(I))
continue;
- // If I is in the chain, we can tell whether it aliases ChainIt by checking
- // what offset ChainIt accesses. This may be better than AA is able to do.
- //
- // We should really only have duplicate offsets for stores (the duplicate
- // loads should be CSE'ed), but in case we have a duplicate load, we'll
- // split the chain so we don't have to handle this case specially.
- if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
- // I and ChainElem overlap if:
- // - I and ChainElem have the same offset, OR
- // - I's offset is less than ChainElem's, but I touches past the
- // beginning of ChainElem, OR
- // - ChainElem's offset is less than I's, but ChainElem touches past the
- // beginning of I.
- const APInt &IOffset = OffsetIt->second;
- unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
- if (IOffset == ChainElemOffset ||
- (IOffset.sle(ChainElemOffset) &&
- (IOffset + IElemSize).sgt(ChainElemOffset)) ||
- (ChainElemOffset.sle(IOffset) &&
- (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
- LLVM_DEBUG({
- // Double check that AA also sees this alias. If not, we probably
- // have a bug.
- ModRefInfo MR =
- BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
- assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
- dbgs() << "LSV: Found alias in chain: " << *I << "\n";
- });
- return false; // We found an aliasing instruction; bail.
- }
-
- continue; // We're confident there's no alias.
- }
+ // Allow on-chain aliasing because write-order is preserved when stores are
+ // vectorized.
+ if (ChainSet.count(I))
+ continue;
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
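With ChainSet, an instruction that is itself a member of the chain is simply skipped instead of being offset-checked: vectorizeChain emits the stored values in chain order, so when two chain members write the same bytes, the later store's value ends up in the vector and the earlier one is harmless. A minimal IR sketch of the pattern this permits, mirroring the @onevec case in the new test further down (function and value names here are illustrative; the exact rewrite is up to the vectorizer):

define void @redundant(ptr %ptr, <1 x i32> %sd0, i32 %sd1) {
  ; illustrative sketch, not part of this patch
  store <1 x i32> %sd0, ptr %ptr, align 4   ; same address as the next store
  store i32 %sd1, ptr %ptr, align 4         ; later store; its value wins
  ret void
}

Previously the second store was reported as an in-chain alias and the chain was split, so both stores stayed as written; now they fold into a single <1 x i32> store of %sd1.
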
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index 57da5976b3cfa..6f3c2fc5f387e 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -10,11 +10,7 @@ define amdgpu_kernel void @no_crash(i32 %arg) {
; GCN-SAME: i32 [[ARG:%.*]]) {
; GCN-NEXT: [[TEMP2:%.*]] = add i32 [[ARG]], 14
; GCN-NEXT: [[TEMP3:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0:[0-9]+]], i32 0, i32 [[TEMP2]]
-; GCN-NEXT: [[TEMP4:%.*]] = add i32 [[ARG]], 15
-; GCN-NEXT: [[TEMP5:%.*]] = getelementptr [16384 x i32], ptr addrspace(3) @[[GLOB0]], i32 0, i32 [[TEMP4]]
; GCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(3) [[TEMP3]], align 4
-; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
-; GCN-NEXT: store i32 0, ptr addrspace(3) [[TEMP5]], align 4
; GCN-NEXT: ret void
;
%temp2 = add i32 %arg, 14
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
new file mode 100644
index 0000000000000..cd3e3bded681a
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vectorize-redund-stores.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
+; CHECK-LABEL: define void @onevec(
+; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
+; CHECK-NEXT: store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
+; CHECK-NEXT: ret void
+;
+ store <1 x i32> %sd0, ptr %ptr, align 4
+ store i32 %sd1, ptr %ptr, align 4
+
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
+ store i32 %sd2, ptr %gep1, align 4
+ store <1 x i32> %sd3, ptr %gep1, align 4
+
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
+ store <1 x i32> %sd4, ptr %gep2, align 4
+ store <1 x i32> %sd5, ptr %gep2, align 4
+ ret void
+}
+
+define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
+; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret void
+;
+ store i32 %sd0, ptr %ptr, align 4
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ store <2 x i32> %sd1, ptr %gep1, align 4
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store <2 x i32> %sd2, ptr %gep2, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
+ store i32 %sd3, ptr %gep3, align 4
+ ret void
+}
+
+define void @vect_zext_bitcast_i8_st4_to_i32_idx(ptr addrspace(1) %arg1, i32 %base, i32 %sd1, i32 %sd2, i32 %sd25, i32 %sd3, i32 %sd4) {
+; CHECK-LABEL: define void @vect_zext_bitcast_i8_st4_to_i32_idx(
+; CHECK-SAME: ptr addrspace(1) [[ARG1:%.*]], i32 [[BASE:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], i32 [[SD25:%.*]], i32 [[SD3:%.*]], i32 [[SD4:%.*]]) {
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw i32 [[BASE]], 0
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[ADD1]] to i64
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT1]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[SD1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SD2]], i32 1
+; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[GEP1]], align 4
+; CHECK-NEXT: [[ADD25:%.*]] = add nuw i32 [[BASE]], 6
+; CHECK-NEXT: [[ZEXT25:%.*]] = zext i32 [[ADD25]] to i64
+; CHECK-NEXT: [[GEP25:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT25]]
+; CHECK-NEXT: store i32 [[SD25]], ptr addrspace(1) [[GEP25]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nuw i32 [[BASE]], 8
+; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[ADD3]] to i64
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG1]], i64 [[ZEXT3]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[SD3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[SD4]], i32 1
+; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr addrspace(1) [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %add1 = add nuw i32 %base, 0
+ %zext1 = zext i32 %add1 to i64
+ %gep1 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext1
+ store i32 %sd1, ptr addrspace(1) %gep1, align 4
+ %add2 = add nuw i32 %base, 4
+ %zext2 = zext i32 %add2 to i64
+ %gep2 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext2
+ store i32 %sd2, ptr addrspace(1) %gep2, align 4
+
+ ; A store with 2-byte overlap breaks continuity.
+ %add25 = add nuw i32 %base, 6
+ %zext25 = zext i32 %add25 to i64
+ %gep25 = getelementptr inbounds i8,ptr addrspace(1) %arg1, i64 %zext25
+ store i32 %sd25, ptr addrspace(1) %gep25, align 4
+
+ %add3 = add nuw i32 %base, 8
+ %zext3 = zext i32 %add3 to i64
+ %gep3 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext3
+ store i32 %sd3, ptr addrspace(1) %gep3, align 4
+ %add4 = add nuw i32 %base, 12
+ %zext4 = zext i32 %add4 to i64
+ %gep4 = getelementptr inbounds i8, ptr addrspace(1) %arg1, i64 %zext4
+ store i32 %sd4, ptr addrspace(1) %gep4, align 4
+ ret void
+}
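
Note that only chain members get this exemption: an intervening access that is not part of the chain still goes through the BatchAA query in isSafeToMove, and a may-alias result still splits the chain. A hypothetical sketch, not taken from this patch (names are illustrative), assuming %p and %q are plain pointer arguments that AA cannot prove distinct:

define void @off_chain_alias(ptr %p, ptr %q, i32 %a, i32 %b) {
  store i32 %a, ptr %p, align 4
  store i32 0, ptr %q, align 4              ; not in the chain; may alias %p
  %gep = getelementptr inbounds i8, ptr %p, i32 4
  store i32 %b, ptr %gep, align 4
  ret void
}

The store through %q lands in a different equivalence class, so it is not in ChainSet; the chain of stores through %p is split around it and the two i32 stores are left unvectorized.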