[llvm] [LoadStoreVectorizer] Fill gaps in load/store chains to enable vectorization (PR #159388)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 25 09:21:16 PDT 2025
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/159388
From 0eb966900d2001a7c5abff565fc218a0f93966e8 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 17 Sep 2025 15:32:39 +0000
Subject: [PATCH 1/5] [LoadStoreVectorizer] Fill gaps in loads/stores to enable
vectorization
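
At a high level: when a chain of loads or stores is contiguous except for a
one- or two-element gap, the vectorizer now inserts synthetic "extra" elements
so the chain can still be vectorized. A rough before/after sketch (hand-written
for illustration, modeled on the gap-fill.ll tests added below; not actual tool
output):

    ;; before: only elements 0 and 3 of an aligned 16-byte region are loaded
    %ld0 = load i32, ptr %p, align 16
    %gep3 = getelementptr inbounds i8, ptr %p, i32 12
    %ld3 = load i32, ptr %gep3, align 4

    ;; after gap filling and vectorization
    %vec = load <4 x i32>, ptr %p, align 16
    %ld0.v = extractelement <4 x i32> %vec, i32 0
    %ld3.v = extractelement <4 x i32> %vec, i32 3

For store chains, the synthetic elements store poison and the chain is emitted
as a masked store with a constant mask, so the gap lanes are never actually
written. Loads are only widened this way when the target reports
isLegalToWidenLoads(), and stores only when a legal masked store exists.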
---
.../llvm/Analysis/TargetTransformInfo.h | 6 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +
.../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 +
.../Vectorize/LoadStoreVectorizer.cpp | 435 ++++++++++++--
.../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 40 +-
.../CodeGen/NVPTX/param-vectorize-device.ll | 6 +-
llvm/test/CodeGen/NVPTX/variadics-backend.ll | 2 +-
.../LoadStoreVectorizer/NVPTX/extend-chain.ll | 81 +++
.../NVPTX/gap-fill-cleanup.ll | 37 ++
.../NVPTX/gap-fill-invariant.ll | 83 +++
.../NVPTX/gap-fill-vectors.ll | 186 ++++++
.../LoadStoreVectorizer/NVPTX/gap-fill.ll | 194 +++++++
.../LoadStoreVectorizer/NVPTX/masked-store.ll | 541 ++++++++++++++++++
.../LoadStoreVectorizer/NVPTX/vectorize_i8.ll | 3 +-
15 files changed, 1544 insertions(+), 78 deletions(-)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 41ff54f0781a2..f8f134c833ea2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -817,6 +817,12 @@ class TargetTransformInfo {
LLVM_ABI bool isLegalMaskedLoad(Type *DataType, Align Alignment,
unsigned AddressSpace) const;
+ /// Return true if it is legal to widen loads beyond their current width,
+ /// assuming the result is still well-aligned. For example, converting a load
+  /// i32 to a load i64, or vectorizing three contiguous load i32s into a load
+ /// <4 x i32>.
+ LLVM_ABI bool isLegalToWidenLoads() const;
+
/// Return true if the target supports nontemporal store.
LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const;
/// Return true if the target supports nontemporal load.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf51631a..55bd4bd709589 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -318,6 +318,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ virtual bool isLegalToWidenLoads() const { return false; }
+
virtual bool isLegalNTStore(Type *DataType, Align Alignment) const {
// By default, assume nontemporal memory stores are available for stores
// that are aligned and have a size that is a power of 2.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 09b50c5270e57..89cda79558057 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -476,6 +476,10 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment,
return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace);
}
+bool TargetTransformInfo::isLegalToWidenLoads() const {
+ return TTIImpl->isLegalToWidenLoads();
+}
+
bool TargetTransformInfo::isLegalNTStore(Type *DataType,
Align Alignment) const {
return TTIImpl->isLegalNTStore(DataType, Alignment);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index b32d931bd3074..d56cff1ce3695 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -72,6 +72,8 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
}
+  bool isLegalToWidenLoads() const override { return true; }
+
// NVPTX has infinite registers of all kinds, but the actual machine doesn't.
// We conservatively return 1 here which is just enough to enable the
// vectorizers but disables heuristics based on the number of registers.
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7b5137b0185ab..04f4e92826a52 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -119,6 +119,29 @@ using namespace llvm;
#define DEBUG_TYPE "load-store-vectorizer"
+cl::opt<bool>
+ ExtendLoads("vect-extend-loads", cl::Hidden,
+ cl::desc("Load more elements if the target VF is higher "
+ "than the chain length."),
+ cl::init(true));
+
+cl::opt<bool> ExtendStores(
+ "vect-extend-stores", cl::Hidden,
+ cl::desc("Store more elements if the target VF is higher "
+ "than the chain length and we have access to masked stores."),
+ cl::init(true));
+
+cl::opt<bool> FillLoadGaps(
+ "vect-fill-load-gaps", cl::Hidden,
+    cl::desc("Whether to introduce loads in gaps to enable vectorization."),
+ cl::init(true));
+
+cl::opt<bool>
+ FillStoreGaps("vect-fill-store-gaps", cl::Hidden,
+                  cl::desc("Whether to introduce stores in gaps to enable "
+ "vectorization into masked stores."),
+ cl::init(true));
+
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
@@ -246,12 +269,16 @@ class Vectorizer {
const DataLayout &DL;
IRBuilder<> Builder;
- // We could erase instrs right after vectorizing them, but that can mess up
- // our BB iterators, and also can make the equivalence class keys point to
- // freed memory. This is fixable, but it's simpler just to wait until we're
- // done with the BB and erase all at once.
+ /// We could erase instrs right after vectorizing them, but that can mess up
+ /// our BB iterators, and also can make the equivalence class keys point to
+ /// freed memory. This is fixable, but it's simpler just to wait until we're
+ /// done with the BB and erase all at once.
SmallVector<Instruction *, 128> ToErase;
+ /// We insert load/store instructions and GEPs to fill gaps and extend chains
+ /// to enable vectorization. Keep track and delete them later.
+ DenseSet<Instruction *> ExtraElements;
+
public:
Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
@@ -344,6 +371,28 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Is a load/store with this alignment allowed by TTI and at least as fast
+ /// as an unvectorized load/store.
+ bool accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS, Align Alignment,
+ unsigned VecElemBits) const;
+
+  /// Before attempting to fill gaps, check whether the chain could possibly
+  /// form a legal masked store for the address space and element type, to
+  /// avoid wasting compile time when it cannot.
+ bool shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const;
+
+ /// Create a new GEP and a new Load/Store instruction such that the GEP
+ /// is pointing at PrevElem + Offset. In the case of stores, store poison.
+ /// Extra elements will either be combined into a vector/masked store or
+ /// deleted before the end of the pass.
+ ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset,
+ StringRef Prefix,
+ Align Alignment = Align(1));
+
+ /// Delete dead GEPs and extra Load/Store instructions created by
+ /// createExtraElementAfter
+ void deleteExtraElements();
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -457,12 +506,21 @@ bool Vectorizer::run() {
Changed |= runOnPseudoBB(*It, *std::next(It));
for (Instruction *I : ToErase) {
+      // Skip extra elements here; they are deleted in deleteExtraElements.
+      // ExtraElements contains both extra elements that *were* vectorized and
+      // extra elements that *were not*, whereas ToErase only contains extra
+      // elements that *were* vectorized. Skipping them here and handling them
+      // all in deleteExtraElements avoids double deletion.
+ if (ExtraElements.contains(I))
+ continue;
auto *PtrOperand = getLoadStorePointerOperand(I);
if (I->use_empty())
I->eraseFromParent();
RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
}
ToErase.clear();
+ deleteExtraElements();
}
return Changed;
@@ -623,6 +681,29 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
dumpChain(C);
});
+  // If the chain is not contiguous, we try to fill the gaps with "extra"
+  // elements, artificially making it contiguous in the hope of enabling
+  // vectorization.
+ // - Filling gaps in loads is always ok if the target supports widening loads.
+ // - For stores, we only fill gaps if there is a potentially legal masked
+ // store for the target. If later on, we don't end up with a chain that
+ // could be vectorized into a legal masked store, the chains with extra
+ // elements will be filtered out in splitChainByAlignment.
+ bool TryFillGaps = isa<LoadInst>(C[0].Inst)
+ ? (FillLoadGaps && TTI.isLegalToWidenLoads())
+ : (FillStoreGaps && shouldAttemptMaskedStore(C));
+
+ unsigned ASPtrBits =
+ DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst));
+
+  // Compute the alignment of the chain's leader (which every offset is based
+  // on) using the current first element of the chain. This is conservative;
+  // we may be able to derive a better alignment by iterating over the chain
+  // and finding the actual leader.
+ Align LeaderOfChainAlign =
+ commonAlignment(getLoadStoreAlignment(C[0].Inst),
+ C[0].OffsetFromLeader.abs().getLimitedValue());
+
std::vector<Chain> Ret;
Ret.push_back({C.front()});
@@ -633,7 +714,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
"collectEquivalenceClass");
- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
+ APInt PrevSzBytes = APInt(ASPtrBits, SzBits / 8);
+ APInt PrevReadEnd = Prev.OffsetFromLeader + PrevSzBytes;
// Add this instruction to the end of the current chain, or start a new one.
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
@@ -642,10 +724,54 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
<< *Prev.Inst << " (ends at offset " << PrevReadEnd
<< ") -> " << *It->Inst << " (starts at offset "
<< It->OffsetFromLeader << ")\n");
- if (AreContiguous)
+
+ if (AreContiguous) {
CurChain.push_back(*It);
- else
- Ret.push_back({*It});
+ continue;
+ }
+
+    // For now, we don't fill gaps between loads/stores of different sizes.
+    // Additionally, as a conservative heuristic, we only fill gaps of 1-2
+    // elements. Generating loads/stores with too many unused bytes has the
+    // side effect of increasing register pressure (on NVIDIA targets at
+    // least), which could cancel out the benefit of reducing the number of
+    // loads/stores.
+ if (TryFillGaps &&
+ SzBits == DL.getTypeSizeInBits(getLoadStoreType(It->Inst))) {
+ APInt OffsetOfGapStart = Prev.OffsetFromLeader + PrevSzBytes;
+ APInt GapSzBytes = It->OffsetFromLeader - OffsetOfGapStart;
+ if (GapSzBytes == PrevSzBytes) {
+        // There is a one-element gap between Prev and Curr; create one extra
+        // element.
+ ChainElem NewElem = createExtraElementAfter(
+ Prev, PrevSzBytes, "GapFill",
+ commonAlignment(LeaderOfChainAlign,
+ OffsetOfGapStart.abs().getLimitedValue()));
+ CurChain.push_back(NewElem);
+ CurChain.push_back(*It);
+ continue;
+ }
+      // There is a two-element gap between Prev and Curr; only create two
+      // extra elements if Prev is the first element in a sequence of four.
+      // This has the highest chance of resulting in a beneficial vectorization.
+ if ((GapSzBytes == 2 * PrevSzBytes) && (CurChain.size() % 4 == 1)) {
+ ChainElem NewElem1 = createExtraElementAfter(
+ Prev, PrevSzBytes, "GapFill",
+ commonAlignment(LeaderOfChainAlign,
+ OffsetOfGapStart.abs().getLimitedValue()));
+ ChainElem NewElem2 = createExtraElementAfter(
+ NewElem1, PrevSzBytes, "GapFill",
+ commonAlignment(
+ LeaderOfChainAlign,
+ (OffsetOfGapStart + PrevSzBytes).abs().getLimitedValue()));
+ CurChain.push_back(NewElem1);
+ CurChain.push_back(NewElem2);
+ CurChain.push_back(*It);
+ continue;
+ }
+ }
+
+ // The chain is not contiguous and cannot be made contiguous with gap
+ // filling, so we need to start a new chain.
+ Ret.push_back({*It});
}
// Filter out length-1 chains, these are uninteresting.
@@ -721,6 +847,14 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
+ // For compile time reasons, we cache whether or not the superset
+ // of all candidate chains contains any extra stores from earlier gap
+ // filling.
+ bool CandidateChainsMayContainExtraStores =
+ !IsLoadChain && any_of(C, [this](const ChainElem &E) {
+ return ExtraElements.contains(E.Inst);
+ });
+
std::vector<Chain> Ret;
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
// Find candidate chains of size not greater than the largest vector reg.
@@ -769,41 +903,6 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
continue;
}
- // Is a load/store with this alignment allowed by TTI and at least as fast
- // as an unvectorized load/store?
- //
- // TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
- auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
- &F = F](Align Alignment) {
- if (Alignment.value() % SizeBytes == 0)
- return true;
- unsigned VectorizedSpeed = 0;
- bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
- F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
- if (!AllowsMisaligned) {
- LLVM_DEBUG(dbgs()
- << "LSV: Access of " << SizeBytes << "B in addrspace "
- << AS << " with alignment " << Alignment.value()
- << " is misaligned, and therefore can't be vectorized.\n");
- return false;
- }
-
- unsigned ElementwiseSpeed = 0;
- (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
- Alignment, &ElementwiseSpeed);
- if (VectorizedSpeed < ElementwiseSpeed) {
- LLVM_DEBUG(dbgs()
- << "LSV: Access of " << SizeBytes << "B in addrspace "
- << AS << " with alignment " << Alignment.value()
- << " has relative speed " << VectorizedSpeed
- << ", which is lower than the elementwise speed of "
- << ElementwiseSpeed
- << ". Therefore this access won't be vectorized.\n");
- return false;
- }
- return true;
- };
-
// If we're loading/storing from an alloca, align it if possible.
//
// FIXME: We eagerly upgrade the alignment, regardless of whether TTI
@@ -818,8 +917,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
isa<AllocaInst>(PtrOperand->stripPointerCasts());
Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
Align PrefAlign = Align(StackAdjustedAlignment);
- if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
- IsAllowedAndFast(PrefAlign)) {
+ if (IsAllocaAccess && Alignment.value() % SizeBytes != 0) {
Align NewAlign = getOrEnforceKnownAlignment(
PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
if (NewAlign >= Alignment) {
@@ -831,7 +929,59 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
}
}
- if (!IsAllowedAndFast(Alignment)) {
+ Chain ExtendingLoadsStores;
+    bool ExtendChain = IsLoadChain ? ExtendLoads : ExtendStores;
+ if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 &&
+ VecElemBits >= 8) {
+ // TargetVF may be a lot higher than NumVecElems,
+ // so only extend to the next power of 2.
+ assert(VecElemBits % 8 == 0);
+ unsigned VecElemBytes = VecElemBits / 8;
+ unsigned NewNumVecElems = PowerOf2Ceil(NumVecElems);
+ unsigned NewSizeBytes = VecElemBytes * NewNumVecElems;
+
+ assert(NewNumVecElems <= TargetVF);
+
+ LLVM_DEBUG(dbgs() << "LSV: attempting to extend chain of "
+ << NumVecElems << " "
+ << (IsLoadChain ? "loads" : "stores") << " to "
+ << NewNumVecElems << " elements\n");
+      // Do not artificially extend the chain if that would make it
+      // misaligned; otherwise we may unnecessarily split the chain when the
+      // target actually supports a non-pow2 VF.
+ if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) &&
+ ((IsLoadChain ? TTI.isLegalToWidenLoads()
+ : TTI.isLegalMaskedStore(
+ FixedVectorType::get(VecElemTy, NewNumVecElems),
+ Alignment, AS, /*IsMaskConstant=*/true)))) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: extending " << (IsLoadChain ? "load" : "store")
+ << " chain of " << NumVecElems << " "
+ << (IsLoadChain ? "loads" : "stores")
+ << " with total byte size of " << SizeBytes << " to "
+ << NewNumVecElems << " "
+ << (IsLoadChain ? "loads" : "stores")
+ << " with total byte size of " << NewSizeBytes
+ << ", TargetVF=" << TargetVF << " \n");
+
+ unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
+ ChainElem Prev = C[CEnd];
+ for (unsigned i = 0; i < (NewNumVecElems - NumVecElems); i++) {
+ ChainElem NewElem = createExtraElementAfter(
+ Prev, APInt(ASPtrBits, VecElemBytes), "Extend");
+ ExtendingLoadsStores.push_back(NewElem);
+ Prev = ExtendingLoadsStores.back();
+ }
+
+ // Update the size and number of elements for upcoming checks.
+ SizeBytes = NewSizeBytes;
+ NumVecElems = NewNumVecElems;
+ }
+ }
+
+ if (!accessIsAllowedAndFast(SizeBytes, AS, Alignment, VecElemBits)) {
LLVM_DEBUG(
dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
"because its alignment is not AllowedAndFast: "
@@ -849,10 +999,41 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
continue;
}
+ if (CandidateChainsMayContainExtraStores) {
+ // The legality of adding extra stores to ExtendingLoadsStores has
+ // already been checked, but if the candidate chain contains extra
+ // stores from an earlier optimization, confirm legality now.
+ // This filter is essential because, when filling gaps in
+      // splitChainByContiguity, we queried the API to check that (for a given
+ // element type and address space) there *may* be a legal masked store
+ // we can try to create. Now, we need to check if the actual chain we
+ // ended up with is legal to turn into a masked store.
+ // This is relevant for NVPTX targets, for example, where a masked store
+ // is only legal if we have ended up with a 256-bit vector.
+ bool CandidateChainContainsExtraStores = llvm::any_of(
+ ArrayRef<ChainElem>(C).slice(CBegin, CEnd - CBegin + 1),
+ [this](const ChainElem &E) {
+ return ExtraElements.contains(E.Inst);
+ });
+
+ if (CandidateChainContainsExtraStores &&
+ !TTI.isLegalMaskedStore(
+ FixedVectorType::get(VecElemTy, NumVecElems), Alignment, AS,
+ /*IsMaskConstant=*/true)) {
+ LLVM_DEBUG(dbgs()
+ << "LSV: splitChainByAlignment discarding candidate chain "
+ "because it contains extra stores that we cannot "
+ "legally vectorize into a masked store \n");
+ continue;
+ }
+ }
+
// Hooray, we can vectorize this chain!
Chain &NewChain = Ret.emplace_back();
for (unsigned I = CBegin; I <= CEnd; ++I)
NewChain.emplace_back(C[I]);
+ for (ChainElem E : ExtendingLoadsStores)
+ NewChain.emplace_back(E);
CBegin = CEnd; // Skip over the instructions we've added to the chain.
break;
}
@@ -864,6 +1045,12 @@ bool Vectorizer::vectorizeChain(Chain &C) {
if (C.size() < 2)
return false;
+  // If we are left with a two-element chain and one of the elements is an
+  // extra element, we don't want to vectorize it.
+ if (C.size() == 2 && (ExtraElements.contains(C[0].Inst) ||
+ ExtraElements.contains(C[1].Inst)))
+ return false;
+
sortChainInOffsetOrder(C);
LLVM_DEBUG({
@@ -983,12 +1170,41 @@ bool Vectorizer::vectorizeChain(Chain &C) {
}
}
- // Chain is in offset order, so C[0] is the instr with the lowest offset,
- // i.e. the root of the vector.
- VecInst = Builder.CreateAlignedStore(
- Vec,
- getLoadStorePointerOperand(C[0].Inst),
- Alignment);
+    // If the chain contains any extra stores, we must vectorize it into a
+    // masked store so that the gap lanes are not actually written.
+ bool ChainContainsExtraStores = llvm::any_of(C, [this](const ChainElem &E) {
+ return ExtraElements.contains(E.Inst);
+ });
+ if (ChainContainsExtraStores) {
+ assert(TTI.isLegalMaskedStore(Vec->getType(), Alignment, AS,
+ /*IsMaskConstant=*/true));
+ unsigned MaskIdx = 0;
+    // Loop through the chain and create a mask for the masked store.
+ Value *Mask = PoisonValue::get(FixedVectorType::get(
+ Builder.getInt1Ty(), cast<FixedVectorType>(VecTy)->getNumElements()));
+ for (const ChainElem &E : C) {
+ bool IsExtraStore = ExtraElements.contains(E.Inst);
+ if (FixedVectorType *VT =
+ dyn_cast<FixedVectorType>(getLoadStoreType(E.Inst))) {
+ for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
+ Mask = Builder.CreateInsertElement(Mask,
+ Builder.getInt1(!IsExtraStore),
+ Builder.getInt32(MaskIdx++));
+ }
+ } else {
+ Mask =
+ Builder.CreateInsertElement(Mask, Builder.getInt1(!IsExtraStore),
+ Builder.getInt32(MaskIdx++));
+ }
+ }
+ VecInst = Builder.CreateMaskedStore(
+ Vec, getLoadStorePointerOperand(C[0].Inst), Alignment, Mask);
+ } else {
+ // Chain is in offset order, so C[0] is the instr with the lowest offset,
+ // i.e. the root of the vector.
+ VecInst = Builder.CreateAlignedStore(
+ Vec, getLoadStorePointerOperand(C[0].Inst), Alignment);
+ }
}
propagateMetadata(VecInst, C);
@@ -1641,3 +1857,118 @@ std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
.sextOrTrunc(OrigBitWidth);
return std::nullopt;
}
+
+bool Vectorizer::accessIsAllowedAndFast(unsigned SizeBytes, unsigned AS,
+ Align Alignment,
+ unsigned VecElemBits) const {
+ if (Alignment.value() % SizeBytes == 0)
+ return true;
+ unsigned VectorizedSpeed = 0;
+ bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
+ F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
+ if (!AllowsMisaligned) {
+ LLVM_DEBUG(
+ dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace " << AS
+ << " with alignment " << Alignment.value()
+ << " is misaligned, and therefore can't be vectorized.\n");
+ return false;
+ }
+
+ unsigned ElementwiseSpeed = 0;
+ (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
+ Alignment, &ElementwiseSpeed);
+ if (VectorizedSpeed < ElementwiseSpeed) {
+ LLVM_DEBUG(dbgs() << "LSV: Access of " << SizeBytes << "B in addrspace "
+ << AS << " with alignment " << Alignment.value()
+ << " has relative speed " << VectorizedSpeed
+ << ", which is lower than the elementwise speed of "
+ << ElementwiseSpeed
+ << ". Therefore this access won't be vectorized.\n");
+ return false;
+ }
+ return true;
+}
+
+bool Vectorizer::shouldAttemptMaskedStore(const ArrayRef<ChainElem> C) const {
+ assert(isa<StoreInst>(C[0].Inst));
+
+ unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
+ Type *ElementType = getLoadStoreType(C[0].Inst)->getScalarType();
+ unsigned VecRegBits = TTI.getLoadStoreVecRegBitWidth(AS);
+  // Assume max alignment here; splitChainByAlignment will check the actual
+  // alignment later and discard the chain if it is not sufficient.
+ Align OptimisticAlign = Align(VecRegBits / 8);
+ unsigned int MaxVectorNumElems =
+ VecRegBits / DL.getTypeSizeInBits(ElementType);
+
+ // Attempt to find the smallest power-of-two number of elements that, if
+ // well aligned, could be represented as a legal masked store.
+ // If one exists for a given element type and address space, it is worth
+ // attempting to fill gaps as we may be able to create a legal masked store.
+ // If we do not end up with a legal masked store, chains with extra elements
+ // will be discarded.
+ const unsigned MinMaskedStoreNumElems = 4;
+ for (unsigned NumElems = MinMaskedStoreNumElems;
+ NumElems <= MaxVectorNumElems; NumElems *= 2) {
+    FixedVectorType *VecTy = FixedVectorType::get(ElementType, NumElems);
+    if (TTI.isLegalMaskedStore(VecTy, OptimisticAlign, AS,
+ /*IsMaskConstant=*/true))
+ return true;
+ }
+ return false;
+}
+
+ChainElem Vectorizer::createExtraElementAfter(const ChainElem &Prev,
+ APInt Offset, StringRef Prefix,
+ Align Alignment) {
+ Instruction *NewElement = nullptr;
+ Builder.SetInsertPoint(Prev.Inst->getNextNode());
+ if (LoadInst *PrevLoad = dyn_cast<LoadInst>(Prev.Inst)) {
+ Value *NewGep = Builder.CreatePtrAdd(
+ PrevLoad->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+ LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+ NewElement = Builder.CreateAlignedLoad(PrevLoad->getType(), NewGep,
+ Alignment, Prefix);
+ } else {
+ StoreInst *PrevStore = cast<StoreInst>(Prev.Inst);
+
+ Value *NewGep = Builder.CreatePtrAdd(
+ PrevStore->getPointerOperand(), Builder.getInt(Offset), Prefix + "GEP");
+ LLVM_DEBUG(dbgs() << "LSV: Extra GEP Created: \n" << *NewGep << "\n");
+ NewElement = Builder.CreateAlignedStore(
+ PoisonValue::get(PrevStore->getValueOperand()->getType()), NewGep,
+ Alignment);
+ }
+
+ // Attach all metadata to the new element.
+ // propagateMetadata will fold it into the final vector when applicable.
+ NewElement->copyMetadata(*Prev.Inst);
+
+ // Cache created elements for tracking and cleanup
+ ExtraElements.insert(NewElement);
+
+ APInt NewOffsetFromLeader = Prev.OffsetFromLeader + Offset;
+ LLVM_DEBUG(dbgs() << "LSV: Extra Element Created: \n"
+ << *NewElement
+ << " OffsetFromLeader: " << NewOffsetFromLeader << "\n");
+ return ChainElem{NewElement, NewOffsetFromLeader};
+}
+
+void Vectorizer::deleteExtraElements() {
+ for (auto *ExtraElement : ExtraElements) {
+ if (isa<LoadInst>(ExtraElement)) {
+ [[maybe_unused]] bool Deleted =
+ RecursivelyDeleteTriviallyDeadInstructions(ExtraElement);
+ assert(Deleted && "Extra Load should always be trivially dead");
+ } else {
+ // Unlike Extra Loads, Extra Stores won't be "dead", but should all be
+ // deleted regardless. They will have either been combined into a masked
+ // store, or will be left behind and need to be cleaned up.
+ auto *PtrOperand = getLoadStorePointerOperand(ExtraElement);
+ ExtraElement->eraseFromParent();
+ RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
+ }
+ }
+
+ ExtraElements.clear();
+}
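
For reference, the masked-store path above ends up emitting a constant-mask
call to the llvm.masked.store intrinsic, with poison in the gap lanes. A
hand-written sketch of the shape of that IR (the lane count and element type
here are for illustration only; which widths are actually legal is decided by
the target's isLegalMaskedStore):

    declare void @llvm.masked.store.v8i32.p1(<8 x i32>, ptr addrspace(1), i32 immarg, <8 x i1>)

    define void @masked_store_sketch(ptr addrspace(1) %out, <8 x i32> %v) {
      ; lane 2 came from a gap-fill store of poison, so its mask bit is false
      call void @llvm.masked.store.v8i32.p1(<8 x i32> %v, ptr addrspace(1) %out, i32 32, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true>)
      ret void
    }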
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index dd9a472984c25..19ec2574e32b4 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -45,29 +45,31 @@ define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED: {
; ENABLED-NEXT: .reg .b16 %rs<10>;
-; ENABLED-NEXT: .reg .b32 %r<13>;
+; ENABLED-NEXT: .reg .b32 %r<17>;
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
-; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
-; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
-; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
-; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
-; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
-; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
-; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
-; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
-; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
-; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
-; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
-; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
-; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
-; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
+; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
+; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
+; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
+; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
+; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
+; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
+; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
+; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
+; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
+; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
+; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
+; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
+; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
+; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
+; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
+; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT: ret;
;
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index 51f6b00601069..4870050dd2d43 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -171,8 +171,7 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
- ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
- ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
+ ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
@@ -394,8 +393,7 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
- ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
- ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
+ ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 61ff80632c789..5499dbce61bae 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -110,7 +110,7 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
-; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
+; CHECK-PTX-NEXT: st.v2.b32 [%SP], {1, 1};
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll
new file mode 100644
index 0000000000000..24d0dea086ba8
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+;; Check that the vectorizer extends a chain to the next power of two,
+;; essentially loading more vector elements than the original code.
+;; Alignment and other requirements for vectorization should still
+;; be met.
+
+define void @load3to4(ptr %p) #0 {
+; CHECK-LABEL: define void @load3to4(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P_0]], align 16
+; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: ret void
+;
+ %p.0 = getelementptr i32, ptr %p, i32 0
+ %p.1 = getelementptr i32, ptr %p, i32 1
+ %p.2 = getelementptr i32, ptr %p, i32 2
+
+ %v0 = load i32, ptr %p.0, align 16
+ %v1 = load i32, ptr %p.1, align 4
+ %v2 = load i32, ptr %p.2, align 8
+
+ ret void
+}
+
+define void @load5to8(ptr %p) #0 {
+; CHECK-LABEL: define void @load5to8(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P_0]], align 16
+; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
+; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
+; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
+; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
+; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
+; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
+; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+; CHECK-NEXT: ret void
+;
+ %p.0 = getelementptr i16, ptr %p, i32 0
+ %p.1 = getelementptr i16, ptr %p, i32 1
+ %p.2 = getelementptr i16, ptr %p, i32 2
+ %p.3 = getelementptr i16, ptr %p, i32 3
+ %p.4 = getelementptr i16, ptr %p, i32 4
+
+ %v0 = load i16, ptr %p.0, align 16
+ %v1 = load i16, ptr %p.1, align 2
+ %v2 = load i16, ptr %p.2, align 4
+ %v3 = load i16, ptr %p.3, align 8
+ %v4 = load i16, ptr %p.4, align 2
+
+ ret void
+}
+
+define void @load3to4_unaligned(ptr %p) #0 {
+; CHECK-LABEL: define void @load3to4_unaligned(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
+; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8
+; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8
+; CHECK-NEXT: ret void
+;
+ %p.0 = getelementptr i32, ptr %p, i32 0
+ %p.1 = getelementptr i32, ptr %p, i32 1
+ %p.2 = getelementptr i32, ptr %p, i32 2
+
+ %v0 = load i32, ptr %p.0, align 8
+ %v1 = load i32, ptr %p.1, align 4
+ %v2 = load i32, ptr %p.2, align 8
+
+ ret void
+}
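
Note: each piece of the new behavior can be disabled independently via the
hidden flags added in this patch (-vect-extend-loads, -vect-extend-stores,
-vect-fill-load-gaps, -vect-fill-store-gaps), which is handy when isolating a
regression or reducing a test case, e.g.:

    opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -vect-extend-loads=false -S < extend-chain.ll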
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll
new file mode 100644
index 0000000000000..e812f8750fa76
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; Test that gap-filled instructions get deleted if they are not used.
+%struct.S10 = type { i32, i32, i32, i32 }
+
+; First, confirm that gap instructions get generated and would be vectorized if the alignment is correct
+define void @fillTwoGapsCanVectorize(ptr %in) {
+; CHECK-LABEL: define void @fillTwoGapsCanVectorize(
+; CHECK-SAME: ptr [[IN:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16
+; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: ret void
+;
+ %load0 = load i32, ptr %in, align 16
+ %getElem = getelementptr i8, ptr %in, i64 12
+ %load3 = load i32, ptr %getElem, align 4
+ ret void
+}
+
+; Then, confirm that gap instructions get deleted if the alignment prevents the vectorization
+define void @fillTwoGapsCantVectorize(ptr %in) {
+; CHECK-LABEL: define void @fillTwoGapsCantVectorize(
+; CHECK-SAME: ptr [[IN:%.*]]) {
+; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4
+; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12
+; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4
+; CHECK-NEXT: ret void
+;
+ %load0 = load i32, ptr %in, align 4
+ %getElem = getelementptr i8, ptr %in, i64 12
+ %load3 = load i32, ptr %getElem, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll
new file mode 100644
index 0000000000000..6d0dfc677780d
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-invariant.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; Test that gap-filled instructions don't lose invariant metadata.
+%struct.S10 = type { i32, i32, i32, i32 }
+
+; With no gaps, if every load is invariant, the vectorized load will be too.
+define i32 @noGaps(ptr %in) {
+; CHECK-LABEL: define i32 @noGaps(
+; CHECK-SAME: ptr [[IN:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
+; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
+; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
+; CHECK-NEXT: ret i32 [[SUM0123]]
+;
+ %load0 = load i32, ptr %in, align 16, !invariant.load !0
+ %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
+ %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
+ %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
+ %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
+ %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
+ %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
+ %sum01 = add i32 %load0, %load1
+ %sum012 = add i32 %sum01, %load2
+ %sum0123 = add i32 %sum012, %load3
+ ret i32 %sum0123
+}
+
+; If one of the loads is not invariant, the vectorized load will not be invariant.
+define i32 @noGapsMissingInvariant(ptr %in) {
+; CHECK-LABEL: define i32 @noGapsMissingInvariant(
+; CHECK-SAME: ptr [[IN:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16
+; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
+; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
+; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
+; CHECK-NEXT: ret i32 [[SUM0123]]
+;
+ %load0 = load i32, ptr %in, align 16, !invariant.load !0
+ %getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
+ %load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
+ %getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
+ %load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
+ %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
+ %load3 = load i32, ptr %getElem3, align 4
+ %sum01 = add i32 %load0, %load1
+ %sum012 = add i32 %sum01, %load2
+ %sum0123 = add i32 %sum012, %load3
+ ret i32 %sum0123
+}
+
+; With a two-element gap, if every real load is invariant, the vectorized load will be too.
+define i32 @twoGaps(ptr %in) {
+; CHECK-LABEL: define i32 @twoGaps(
+; CHECK-SAME: ptr [[IN:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0]]
+; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]]
+; CHECK-NEXT: ret i32 [[SUM]]
+;
+ %load0 = load i32, ptr %in, align 16, !invariant.load !0
+ %getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
+ %load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
+ %sum = add i32 %load0, %load3
+ ret i32 %sum
+}
+
+!0 = !{}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll
new file mode 100644
index 0000000000000..fe7123898d450
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-vectors.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s
+
+; The LSV can handle vector inputs, and gap filling can too, with one exception:
+; currently, we do not fill gaps when the loads enclosing the gap have different sizes.
+; Otherwise, vectors are treated the same as any other element type.
+
+define void @i1x8_gap_gap_i1x8(ptr %ptr) {
+; CHECK-LABEL: define void @i1x8_gap_gap_i1x8(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i1>, ptr [[PTR0]], align 4
+; CHECK-NEXT: [[L03:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[L36:%.*]] = shufflevector <32 x i1> [[TMP1]], <32 x i1> poison, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr3 = getelementptr i8, ptr %ptr, i64 3
+
+ %l0 = load <8 x i1>, ptr %ptr0, align 4
+ %l3 = load <8 x i1>, ptr %ptr3, align 1
+
+ ret void
+}
+
+; The chain elements are different sizes, so gap filling won't kick in.
+define void @i1x8_gap_gap_i1x16(ptr %ptr) {
+; CHECK-LABEL: define void @i1x8_gap_gap_i1x16(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 3
+; CHECK-NEXT: [[L0:%.*]] = load <8 x i1>, ptr [[PTR0]], align 4
+; CHECK-NEXT: [[L3:%.*]] = load <16 x i1>, ptr [[PTR3]], align 2
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr3 = getelementptr i8, ptr %ptr, i64 3
+
+ %l0 = load <8 x i1>, ptr %ptr0, align 4
+ %l3 = load <16 x i1>, ptr %ptr3, align 2
+
+ ret void
+}
+
+; A gap of two <2 x i8> elements between the loads gets filled.
+define void @i8x2_gap_gap_i8x2(ptr %ptr) {
+; CHECK-LABEL: define void @i8x2_gap_gap_i8x2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[PTR0]], align 8
+; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr3 = getelementptr i8, ptr %ptr, i64 6
+
+ %l0 = load <2 x i8>, ptr %ptr0, align 8
+ %l3 = load <2 x i8>, ptr %ptr3, align 2
+
+ ret void
+}
+
+; The chain elements are different sizes, so gap filling won't kick in.
+define void @i8x2_gap_gap_i8(ptr %ptr) {
+; CHECK-LABEL: define void @i8x2_gap_gap_i8(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr i8, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[L0:%.*]] = load <2 x i8>, ptr [[PTR0]], align 8
+; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[PTR3]], align 1
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr3 = getelementptr i8, ptr %ptr, i64 6
+
+ %l0 = load <2 x i8>, ptr %ptr0, align 8
+ %l3 = load i8, ptr %ptr3, align 1
+
+ ret void
+}
+
+
+define void @i16x2_gap_i16x2_i16x2(ptr %ptr) {
+; CHECK-LABEL: define void @i16x2_gap_i16x2_i16x2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16
+; CHECK-NEXT: [[L01:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[GAPFILL2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[L23:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[L34:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr2 = getelementptr i8, ptr %ptr, i64 8
+ %ptr3 = getelementptr i8, ptr %ptr, i64 12
+
+ %l0 = load <2 x i16>, ptr %ptr0, align 16
+ %l2 = load <2 x i16>, ptr %ptr2, align 2
+ %l3 = load <2 x i16>, ptr %ptr3, align 2
+
+ ret void
+}
+
+define void @i16x2_gap_gap_i16x2(ptr %ptr) {
+; CHECK-LABEL: define void @i16x2_gap_gap_i16x2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[PTR0]], align 16
+; CHECK-NEXT: [[L03:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[GAPFILL4:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[GAPFILL25:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[L36:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
+ %ptr0 = getelementptr i8, ptr %ptr, i64 0
+ %ptr3 = getelementptr i8, ptr %ptr, i64 12
+
+ %l0 = load <2 x i16>, ptr %ptr0, align 16
+ %l3 = load <2 x i16>, ptr %ptr3, align 4
+
+ ret void
+}
+
+define void @i32x2_i32x2_gap_i32x2(ptr addrspace(1) %in) {
+; CHECK-LABEL: define void @i32x2_i32x2_gap_i32x2(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[GAPFILL3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT: [[VEC34:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
+ %vec0 = load <2 x i32>, ptr addrspace(1) %in, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
+ %vec1 = load <2 x i32>, ptr addrspace(1) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24
+ %vec3 = load <2 x i32>, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; This gap is filled but then eventually discarded because the total size
+; of the vector is larger than the target supports.
+define void @i64x2_gap_i64x2_i64x2(ptr addrspace(1) %in) {
+; CHECK-LABEL: define void @i64x2_gap_i64x2_i64x2(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[VEC0:%.*]] = load <2 x i64>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 32
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[GETELEM3]], align 32
+; CHECK-NEXT: [[VEC31:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[VEC12:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret void
+;
+ %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
+ %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48
+ %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 16
+ ret void
+}
+
+; This gap is filled but then eventually discarded because the total size
+; of the vector is larger than the target supports.
+define void @i64x2_i64x2_gap_i64x2(ptr addrspace(1) %in) {
+; CHECK-LABEL: define void @i64x2_i64x2_gap_i64x2(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[VEC01:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[VEC32:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48
+; CHECK-NEXT: [[VEC1:%.*]] = load <2 x i64>, ptr addrspace(1) [[GETELEM1]], align 8
+; CHECK-NEXT: ret void
+;
+ %vec0 = load <2 x i64>, ptr addrspace(1) %in, align 32
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ %vec3 = load <2 x i64>, ptr addrspace(1) %getElem3, align 16
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48
+ %vec1 = load <2 x i64>, ptr addrspace(1) %getElem1, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll
new file mode 100644
index 0000000000000..82ebffed7f765
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; Load elements 0, 1, and 3, filling the gap with a generated load of element 2
+define void @test(ptr %ptr) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ %ld1 = load i32, ptr %gep1, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+ ret void
+}
+
+; Load elements 0, 2, and 3, filling the gap with a generated load of element 1
+define void @test2(ptr %ptr) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[LD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+ %ld2 = load i32, ptr %gep2, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+ ret void
+}
+
+; This gap can be filled, but the types are too large to do a v4 load,
+; so we should end up with a v2 load and a single scalar load.
+define void @test3(ptr %ptr) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 24
+; CHECK-NEXT: [[LD3:%.*]] = load i64, ptr [[GEP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, ptr %ptr, align 16
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 8
+ %ld1 = load i64, ptr %gep1, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24
+ %ld3 = load i64, ptr %gep3, align 4
+ ret void
+}
+
+; This gap can be filled, but the types are too large to do a v4 load,
+; so we should end up with a v2 load and a single scalar load.
+define void @test4(ptr %ptr) {
+; CHECK-LABEL: define void @test4(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[LD0:%.*]] = load i64, ptr [[PTR]], align 16
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[GEP2]], align 16
+; CHECK-NEXT: [[LD21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD32:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, ptr %ptr, align 16
+ %gep2 = getelementptr inbounds i8, ptr %ptr, i32 16
+ %ld2 = load i64, ptr %gep2, align 16
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 24
+ %ld3 = load i64, ptr %gep3, align 4
+ ret void
+}
+
+; Load elements 0 and 3, filling the gap with generated loads of elements 1 and 2.
+define void @test5(ptr %ptr) {
+; CHECK-LABEL: define void @test5(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+ ret void
+}
+
+; Load elements 0, 1, 3, 4, 6, and 7, filling gaps at elements 2 and 5.
+define void @test6(ptr %ptr) {
+; CHECK-LABEL: define void @test6(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16
+; CHECK-NEXT: [[LD47:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[GAPFILL28:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ %ld1 = load i32, ptr %gep1, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+
+ %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16
+ %ld4 = load i32, ptr %gep4, align 16
+ %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24
+ %ld6 = load i32, ptr %gep6, align 4
+ %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28
+ %ld7 = load i32, ptr %gep7, align 4
+ ret void
+}
+
+; Load elements 0, 1, 3, 4, and 7; elements 2, 5, and 6 will be filled.
+define void @test7(ptr %ptr) {
+; CHECK-LABEL: define void @test7(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD05:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD16:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL7:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD38:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP4]], align 16
+; CHECK-NEXT: [[LD49:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[GAPFILL210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[GAPFILL411:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[LD712:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ %ld1 = load i32, ptr %gep1, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+
+ %gep4 = getelementptr inbounds i8, ptr %ptr, i32 16
+ %ld4 = load i32, ptr %gep4, align 16
+ %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28
+ %ld7 = load i32, ptr %gep7, align 4
+ ret void
+}
+
+; Load elements 0, 1, 3, 5, 6, and 7; elements 2 and 4 will be filled.
+; The generated load of element 4 is known to be well-aligned because of its
+; distance from the 16-byte-aligned first load.
+define void @test8(ptr %ptr) {
+; CHECK-LABEL: define void @test8(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT: [[LD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LD14:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 12
+; CHECK-NEXT: [[GAPFILLGEP1:%.*]] = getelementptr i8, ptr [[GEP3]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GAPFILLGEP1]], align 16
+; CHECK-NEXT: [[GAPFILL27:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[LD58:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[LD69:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[LD710:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, ptr %ptr, align 16
+ %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+ %ld1 = load i32, ptr %gep1, align 4
+ %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
+ %ld3 = load i32, ptr %gep3, align 4
+
+ %gep5 = getelementptr inbounds i8, ptr %ptr, i32 20
+ %ld5 = load i32, ptr %gep5, align 16
+ %gep6 = getelementptr inbounds i8, ptr %ptr, i32 24
+ %ld6 = load i32, ptr %gep6, align 4
+ %gep7 = getelementptr inbounds i8, ptr %ptr, i32 28
+ %ld7 = load i32, ptr %gep7, align 4
+ ret void
+}
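
As a minimal sketch of the load-side rewrite exercised by test5 above (illustrative only; the "after" value names are not from the patch): two i32 loads at offsets 0 and 12 of a 16-byte-aligned pointer are widened into a single <4 x i32> load, and the originally requested lanes are recovered with extractelement while the gap-fill lanes simply go unused.

  ; before (as written in test5)
  %ld0 = load i32, ptr %ptr, align 16
  %gep3 = getelementptr inbounds i8, ptr %ptr, i32 12
  %ld3 = load i32, ptr %gep3, align 4

  ; after gap filling and vectorization (lanes 1 and 2 have no users)
  %vec = load <4 x i32>, ptr %ptr, align 16
  %ld0.v = extractelement <4 x i32> %vec, i32 0
  %ld3.v = extractelement <4 x i32> %vec, i32 3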
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll
new file mode 100644
index 0000000000000..1346bd0a3fc26
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/masked-store.ll
@@ -0,0 +1,541 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=infer-alignment,load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S -o - %s | FileCheck %s
+
+; POSITIVE TESTS
+
+; store elements 0, 1, and 3, filling the gap with a generated store of element 2
+define void @singleGap(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @singleGap(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 poison, i64 4>, ptr addrspace(1) [[OUT]], i32 32, <4 x i1> <i1 true, i1 true, i1 false, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i64 2, ptr addrspace(1) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i64 4, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; store elements 0, 1, and 3, filling the gap with a generated store of element 2
+define void @singleGapDouble(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @singleGapDouble(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v4f64.p1(<4 x double> <double 1.000000e+00, double 2.000000e+00, double poison, double 4.000000e+00>, ptr addrspace(1) [[OUT]], i32 32, <4 x i1> <i1 true, i1 true, i1 false, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store double 1.0, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store double 2.0, ptr addrspace(1) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store double 4.0, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; store elements 0 and 3, filling the gaps with generated stores of elements 1 and 2
+define void @multipleGaps(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @multipleGaps(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 poison, i64 poison, i64 4>, ptr addrspace(1) [[OUT]], i32 32, <4 x i1> <i1 true, i1 false, i1 false, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(1) %out, align 32
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i64 4, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; store elements 0, 3, 4, and 7, filling the gaps with generated stores of elements 1, 2, 5, and 6
+define void @multipleGaps8xi32(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @multipleGaps8xi32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 poison, i32 poison, i32 2, i32 4, i32 poison, i32 poison, i32 8>, ptr addrspace(1) [[OUT]], i32 32, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store i32 1, ptr addrspace(1) %out, align 32
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i32 2, ptr addrspace(1) %getElem3, align 4
+ %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i32 4, ptr addrspace(1) %getElem4, align 4
+ %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
+ store i32 8, ptr addrspace(1) %getElem7, align 4
+ ret void
+}
+
+; store elements 0, 1, 2, 3, 5, 6, and 7, filling the gap with a generated store of element 4,
+; resulting in two 4xi64 stores, the second one led by the gap-filled store.
+define void @singleGapLongerChain(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @singleGapLongerChain(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
+; CHECK-NEXT: store <4 x i64> <i64 1, i64 2, i64 3, i64 4>, ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: [[GAPFILLGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[GETELEM3]], i64 8
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 poison, i64 6, i64 7, i64 8>, ptr addrspace(1) [[GAPFILLGEP]], i32 32, <4 x i1> <i1 false, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i64 2, ptr addrspace(1) %getElem1, align 8
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i64 3, ptr addrspace(1) %getElem2, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i64 4, ptr addrspace(1) %getElem3, align 8
+ %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 40
+ store i64 6, ptr addrspace(1) %getElem5, align 8
+ %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 48
+ store i64 7, ptr addrspace(1) %getElem6, align 8
+ %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 56
+ store i64 8, ptr addrspace(1) %getElem7, align 8
+ ret void
+}
+
+; store elements 0, 1, and 3, filling the gap with a generated store of element 2
+define void @vectorElements(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @vectorElements(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 7, i32 8>, ptr addrspace(1) [[OUT]], i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 true>)
+; CHECK-NEXT: ret void
+;
+ store <2 x i32> <i32 1, i32 2>, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store <2 x i32> <i32 3, i32 4>, ptr addrspace(1) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store <2 x i32> <i32 7, i32 8>, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; store elements 0, 1, and 3. Element 2 should not end up filled because an 8xi64 masked store is not legal.
+define void @vectorElements64(ptr addrspace(1) %in) {
+; CHECK-LABEL: define void @vectorElements64(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <4 x i64> <i64 1, i64 2, i64 3, i64 4>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN]], i32 48
+; CHECK-NEXT: store <2 x i64> <i64 7, i64 8>, ptr addrspace(1) [[GETELEM1]], align 16
+; CHECK-NEXT: ret void
+;
+ store <2 x i64> <i64 1, i64 2>, ptr addrspace(1) %in, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ store <2 x i64> <i64 3, i64 4>, ptr addrspace(1) %getElem1, align 16
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 48
+ store <2 x i64> <i64 7, i64 8>, ptr addrspace(1) %getElem3, align 16
+ ret void
+}
+
+; store elements 0, 1, and 2, extending the chain with element 3
+define void @extendStores(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @extendStores(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 3, i64 poison>, ptr addrspace(1) [[OUT]], i32 32, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i64 2, ptr addrspace(1) %getElem1, align 8
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i64 3, ptr addrspace(1) %getElem2, align 8
+ ret void
+}
+
+; store elements 0, 1, 2, 3, and 4, extending the chain with elements 5, 6, and 7
+define void @extendStores8xi32(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @extendStores8xi32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>, ptr addrspace(1) [[OUT]], i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT: ret void
+;
+ store i32 1, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
+ store i32 2, ptr addrspace(1) %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i32 3, ptr addrspace(1) %getElem2, align 4
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i32 4, ptr addrspace(1) %getElem3, align 4
+ %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i32 5, ptr addrspace(1) %getElem4, align 4
+ ret void
+}
+
+; store elements 0, 1, 2, 3, and 4, extending the chain with elements 5, 6, and 7
+define void @extendStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @extendStoresFromLoads8xi32(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
+; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[LOAD27]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT: ret void
+;
+ %load0 = load i32, ptr addrspace(1) %in, align 32
+ %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
+ %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4
+ %loadGetElem2 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
+ %load2 = load i32, ptr addrspace(1) %loadGetElem2, align 4
+ %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
+ %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4
+ %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4
+
+ store i32 %load0, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
+ store i32 %load1, ptr addrspace(1) %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i32 %load2, ptr addrspace(1) %getElem2, align 4
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i32 %load3, ptr addrspace(1) %getElem3, align 4
+ %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i32 %load4, ptr addrspace(1) %getElem4, align 4
+ ret void
+}
+
+; store elements 0, 1, 3, and 4; gap-fill element 2 and extend with elements 5, 6, and 7
+define void @extendAndGapFillStoresFromLoads8xi32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @extendAndGapFillStoresFromLoads8xi32(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[LOAD05:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[LOAD16:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[LOAD27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD38:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[LOAD49:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; CHECK-NEXT: [[EXTENDLOAD10:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; CHECK-NEXT: [[EXTENDLOAD211:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
+; CHECK-NEXT: [[EXTENDLOAD412:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[LOAD05]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOAD16]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 poison, i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[LOAD38]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[LOAD49]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 poison, i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 poison, i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 poison, i32 7
+; CHECK-NEXT: call void @llvm.masked.store.v8i32.p1(<8 x i32> [[TMP9]], ptr addrspace(1) [[OUT]], i32 32, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false>)
+; CHECK-NEXT: ret void
+;
+ %load0 = load i32, ptr addrspace(1) %in, align 32
+ %loadGetElem1 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
+ %load1 = load i32, ptr addrspace(1) %loadGetElem1, align 4
+ %loadGetElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
+ %load3 = load i32, ptr addrspace(1) %loadGetElem3, align 4
+ %loadGetElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ %load4 = load i32, ptr addrspace(1) %loadGetElem4, align 4
+
+ store i32 %load0, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
+ store i32 %load1, ptr addrspace(1) %getElem1, align 4
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i32 %load3, ptr addrspace(1) %getElem3, align 4
+ %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i32 %load4, ptr addrspace(1) %getElem4, align 4
+ ret void
+}
+
+
+; NEGATIVE TESTS
+
+; Wrong address space, no gap filling
+define void @singleGapWrongAddrSpace(ptr addrspace(3) %out) {
+; CHECK-LABEL: define void @singleGapWrongAddrSpace(
+; CHECK-SAME: ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <2 x i64> <i64 1, i64 2>, ptr addrspace(3) [[OUT]], align 32
+; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[OUT]], i32 24
+; CHECK-NEXT: store i64 4, ptr addrspace(3) [[GETELEM3]], align 8
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(3) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 8
+ store i64 2, ptr addrspace(3) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(3) %out, i32 24
+ store i64 4, ptr addrspace(3) %getElem3, align 8
+ ret void
+}
+
+; Not enough alignment for a masked store, but we still vectorize the first two stores into a smaller vector
+define void @singleGapMisaligned(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @singleGapMisaligned(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <2 x i64> <i64 1, i64 2>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT: [[GETELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
+; CHECK-NEXT: store i64 4, ptr addrspace(1) [[GETELEM3]], align 8
+; CHECK-NEXT: ret void
+;
+ store i64 1, ptr addrspace(1) %out, align 16
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i64 2, ptr addrspace(1) %getElem1, align 8
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i64 4, ptr addrspace(1) %getElem3, align 8
+ ret void
+}
+
+; Not enough bytes to meet the minimum masked store size for the target
+define void @singleGap4xi32(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @singleGap4xi32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
+; CHECK-NEXT: store <2 x i32> <i32 3, i32 4>, ptr addrspace(1) [[GETELEM2]], align 8
+; CHECK-NEXT: ret void
+;
+ store i32 1, ptr addrspace(1) %out, align 32
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i32 3, ptr addrspace(1) %getElem2, align 4
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i32 4, ptr addrspace(1) %getElem3, align 4
+ ret void
+}
+
+; store elements 0, 1, 2, 5, 6, and 7. Elements 3 and 4 don't get filled because the
+; heuristic only fills 2-element gaps that sit in the middle of an aligned group of
+; four elements; this gap straddles the boundary between two such groups.
+define void @gapInWrongLocation(ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @gapInWrongLocation(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: store <2 x i32> <i32 1, i32 2>, ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
+; CHECK-NEXT: store i32 3, ptr addrspace(1) [[GETELEM2]], align 8
+; CHECK-NEXT: [[GETELEM5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20
+; CHECK-NEXT: store i32 5, ptr addrspace(1) [[GETELEM5]], align 4
+; CHECK-NEXT: [[GETELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
+; CHECK-NEXT: store <2 x i32> <i32 6, i32 7>, ptr addrspace(1) [[GETELEM6]], align 8
+; CHECK-NEXT: ret void
+;
+ store i32 1, ptr addrspace(1) %out, align 32
+ %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
+ store i32 2, ptr addrspace(1) %getElem1, align 4
+ %getElem2 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i32 3, ptr addrspace(1) %getElem2, align 4
+ %getElem5 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20
+ store i32 5, ptr addrspace(1) %getElem5, align 4
+ %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i32 6, ptr addrspace(1) %getElem6, align 4
+ %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
+ store i32 7, ptr addrspace(1) %getElem7, align 4
+ ret void
+}
+
+; This test has 32 bytes of i8s with a 2-element gap in the middle of each 4-element chunk.
+; i8s are not supported by masked stores on the target, so the stores will not be vectorized.
+; The loads, on the other hand, do get gap-filled.
+define void @cantMaski8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @cantMaski8(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[LOAD031:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL32:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL233:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD334:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3
+; CHECK-NEXT: [[LOAD435:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4
+; CHECK-NEXT: [[GAPFILL436:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5
+; CHECK-NEXT: [[GAPFILL637:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6
+; CHECK-NEXT: [[LOAD738:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7
+; CHECK-NEXT: [[LOAD839:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8
+; CHECK-NEXT: [[GAPFILL840:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9
+; CHECK-NEXT: [[GAPFILL1041:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10
+; CHECK-NEXT: [[LOAD1142:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11
+; CHECK-NEXT: [[LOAD1243:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12
+; CHECK-NEXT: [[GAPFILL1244:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13
+; CHECK-NEXT: [[GAPFILL1445:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14
+; CHECK-NEXT: [[LOAD1546:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15
+; CHECK-NEXT: [[LOAD1647:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16
+; CHECK-NEXT: [[GAPFILL1648:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17
+; CHECK-NEXT: [[GAPFILL1849:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18
+; CHECK-NEXT: [[LOAD1950:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19
+; CHECK-NEXT: [[LOAD2051:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20
+; CHECK-NEXT: [[GAPFILL2052:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21
+; CHECK-NEXT: [[GAPFILL2253:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22
+; CHECK-NEXT: [[LOAD2354:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23
+; CHECK-NEXT: [[LOAD2455:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24
+; CHECK-NEXT: [[GAPFILL2456:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25
+; CHECK-NEXT: [[GAPFILL2657:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26
+; CHECK-NEXT: [[LOAD2758:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27
+; CHECK-NEXT: [[LOAD2859:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28
+; CHECK-NEXT: [[GAPFILL2860:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29
+; CHECK-NEXT: [[GAPFILL3061:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30
+; CHECK-NEXT: [[LOAD3162:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31
+; CHECK-NEXT: store i8 [[LOAD031]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: [[OUTELEM3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 3
+; CHECK-NEXT: store i8 [[LOAD334]], ptr addrspace(1) [[OUTELEM3]], align 1
+; CHECK-NEXT: [[OUTELEM4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 4
+; CHECK-NEXT: store i8 [[LOAD435]], ptr addrspace(1) [[OUTELEM4]], align 4
+; CHECK-NEXT: [[OUTELEM7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 7
+; CHECK-NEXT: store i8 [[LOAD738]], ptr addrspace(1) [[OUTELEM7]], align 1
+; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
+; CHECK-NEXT: store i8 [[LOAD839]], ptr addrspace(1) [[OUTELEM8]], align 8
+; CHECK-NEXT: [[OUTELEM11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 11
+; CHECK-NEXT: store i8 [[LOAD1142]], ptr addrspace(1) [[OUTELEM11]], align 1
+; CHECK-NEXT: [[OUTELEM12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 12
+; CHECK-NEXT: store i8 [[LOAD1243]], ptr addrspace(1) [[OUTELEM12]], align 4
+; CHECK-NEXT: [[OUTELEM15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 15
+; CHECK-NEXT: store i8 [[LOAD1546]], ptr addrspace(1) [[OUTELEM15]], align 1
+; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16
+; CHECK-NEXT: store i8 [[LOAD1647]], ptr addrspace(1) [[OUTELEM16]], align 16
+; CHECK-NEXT: [[OUTELEM19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 19
+; CHECK-NEXT: store i8 [[LOAD1950]], ptr addrspace(1) [[OUTELEM19]], align 1
+; CHECK-NEXT: [[OUTELEM20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 20
+; CHECK-NEXT: store i8 [[LOAD2051]], ptr addrspace(1) [[OUTELEM20]], align 4
+; CHECK-NEXT: [[OUTELEM23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 23
+; CHECK-NEXT: store i8 [[LOAD2354]], ptr addrspace(1) [[OUTELEM23]], align 1
+; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
+; CHECK-NEXT: store i8 [[LOAD2455]], ptr addrspace(1) [[OUTELEM24]], align 8
+; CHECK-NEXT: [[OUTELEM27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 27
+; CHECK-NEXT: store i8 [[LOAD2758]], ptr addrspace(1) [[OUTELEM27]], align 1
+; CHECK-NEXT: [[OUTELEM28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 28
+; CHECK-NEXT: store i8 [[LOAD2859]], ptr addrspace(1) [[OUTELEM28]], align 4
+; CHECK-NEXT: [[OUTELEM31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 31
+; CHECK-NEXT: store i8 [[LOAD3162]], ptr addrspace(1) [[OUTELEM31]], align 1
+; CHECK-NEXT: ret void
+;
+ %load0 = load i8, ptr addrspace(1) %in, align 32
+ %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 3
+ %load3 = load i8, ptr addrspace(1) %getElem3, align 1
+ %getElem4 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 4
+ %load4 = load i8, ptr addrspace(1) %getElem4, align 4
+ %getElem7 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 7
+ %load7 = load i8, ptr addrspace(1) %getElem7, align 1
+ %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
+ %load8 = load i8, ptr addrspace(1) %getElem8, align 8
+ %getElem11 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 11
+ %load11 = load i8, ptr addrspace(1) %getElem11, align 1
+ %getElem12 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 12
+ %load12 = load i8, ptr addrspace(1) %getElem12, align 4
+ %getElem15 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 15
+ %load15 = load i8, ptr addrspace(1) %getElem15, align 1
+ %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ %load16 = load i8, ptr addrspace(1) %getElem16, align 16
+ %getElem19 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 19
+ %load19 = load i8, ptr addrspace(1) %getElem19, align 1
+ %getElem20 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 20
+ %load20 = load i8, ptr addrspace(1) %getElem20, align 4
+ %getElem23 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 23
+ %load23 = load i8, ptr addrspace(1) %getElem23, align 1
+ %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24
+ %load24 = load i8, ptr addrspace(1) %getElem24, align 8
+ %getElem27 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 27
+ %load27 = load i8, ptr addrspace(1) %getElem27, align 1
+ %getElem28 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 28
+ %load28 = load i8, ptr addrspace(1) %getElem28, align 4
+ %getElem31 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 31
+ %load31 = load i8, ptr addrspace(1) %getElem31, align 1
+
+ store i8 %load0, ptr addrspace(1) %out, align 32
+ %outElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 3
+ store i8 %load3, ptr addrspace(1) %outElem3, align 1
+ %outElem4 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 4
+ store i8 %load4, ptr addrspace(1) %outElem4, align 4
+ %outElem7 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 7
+ store i8 %load7, ptr addrspace(1) %outElem7, align 1
+ %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i8 %load8, ptr addrspace(1) %outElem8, align 8
+ %outElem11 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 11
+ store i8 %load11, ptr addrspace(1) %outElem11, align 1
+ %outElem12 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 12
+ store i8 %load12, ptr addrspace(1) %outElem12, align 4
+ %outElem15 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 15
+ store i8 %load15, ptr addrspace(1) %outElem15, align 1
+ %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i8 %load16, ptr addrspace(1) %outElem16, align 16
+ %outElem19 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 19
+ store i8 %load19, ptr addrspace(1) %outElem19, align 1
+ %outElem20 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 20
+ store i8 %load20, ptr addrspace(1) %outElem20, align 4
+ %outElem23 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 23
+ store i8 %load23, ptr addrspace(1) %outElem23, align 1
+ %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i8 %load24, ptr addrspace(1) %outElem24, align 8
+ %outElem27 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 27
+ store i8 %load27, ptr addrspace(1) %outElem27, align 1
+ %outElem28 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 28
+ store i8 %load28, ptr addrspace(1) %outElem28, align 4
+ %outElem31 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 31
+ store i8 %load31, ptr addrspace(1) %outElem31, align 1
+
+ ret void
+}
+
+; This test has 32 bytes of i16s with a 2-element gap in the middle of each 4-element chunk.
+; i16s are not supported by masked stores on the target, so the stores will not be vectorized.
+; The loads, on the other hand, do get gap-filled.
+define void @cantMaski16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; CHECK-LABEL: define void @cantMaski16(
+; CHECK-SAME: ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[IN]], align 32
+; CHECK-NEXT: [[LOAD015:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0
+; CHECK-NEXT: [[GAPFILL16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1
+; CHECK-NEXT: [[GAPFILL217:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2
+; CHECK-NEXT: [[LOAD318:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3
+; CHECK-NEXT: [[LOAD419:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4
+; CHECK-NEXT: [[GAPFILL420:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5
+; CHECK-NEXT: [[GAPFILL621:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6
+; CHECK-NEXT: [[LOAD722:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7
+; CHECK-NEXT: [[LOAD823:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8
+; CHECK-NEXT: [[GAPFILL824:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9
+; CHECK-NEXT: [[GAPFILL1025:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10
+; CHECK-NEXT: [[LOAD1126:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11
+; CHECK-NEXT: [[LOAD1227:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12
+; CHECK-NEXT: [[GAPFILL1228:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13
+; CHECK-NEXT: [[GAPFILL1429:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14
+; CHECK-NEXT: [[LOAD1530:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
+; CHECK-NEXT: store i16 [[LOAD015]], ptr addrspace(1) [[OUT]], align 32
+; CHECK-NEXT: [[OUTELEM6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 6
+; CHECK-NEXT: store i16 [[LOAD318]], ptr addrspace(1) [[OUTELEM6]], align 2
+; CHECK-NEXT: [[OUTELEM8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 8
+; CHECK-NEXT: store i16 [[LOAD419]], ptr addrspace(1) [[OUTELEM8]], align 8
+; CHECK-NEXT: [[OUTELEM14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 14
+; CHECK-NEXT: store i16 [[LOAD722]], ptr addrspace(1) [[OUTELEM14]], align 2
+; CHECK-NEXT: [[OUTELEM16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 16
+; CHECK-NEXT: store i16 [[LOAD823]], ptr addrspace(1) [[OUTELEM16]], align 16
+; CHECK-NEXT: [[OUTELEM22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 22
+; CHECK-NEXT: store i16 [[LOAD1126]], ptr addrspace(1) [[OUTELEM22]], align 2
+; CHECK-NEXT: [[OUTELEM24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 24
+; CHECK-NEXT: store i16 [[LOAD1227]], ptr addrspace(1) [[OUTELEM24]], align 8
+; CHECK-NEXT: [[OUTELEM30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[OUT]], i32 30
+; CHECK-NEXT: store i16 [[LOAD1530]], ptr addrspace(1) [[OUTELEM30]], align 2
+; CHECK-NEXT: ret void
+;
+ %load0 = load i16, ptr addrspace(1) %in, align 32
+ %getElem6 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 6
+ %load3 = load i16, ptr addrspace(1) %getElem6, align 2
+ %getElem8 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 8
+ %load4 = load i16, ptr addrspace(1) %getElem8, align 8
+ %getElem14 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 14
+ %load7 = load i16, ptr addrspace(1) %getElem14, align 2
+ %getElem16 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 16
+ %load8 = load i16, ptr addrspace(1) %getElem16, align 16
+ %getElem22 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 22
+ %load11 = load i16, ptr addrspace(1) %getElem22, align 2
+ %getElem24 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 24
+ %load12 = load i16, ptr addrspace(1) %getElem24, align 8
+ %getElem30 = getelementptr inbounds i8, ptr addrspace(1) %in, i32 30
+ %load15 = load i16, ptr addrspace(1) %getElem30, align 2
+
+ store i16 %load0, ptr addrspace(1) %out, align 32
+ %outElem6 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 6
+ store i16 %load3, ptr addrspace(1) %outElem6, align 2
+ %outElem8 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
+ store i16 %load4, ptr addrspace(1) %outElem8, align 8
+ %outElem14 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 14
+ store i16 %load7, ptr addrspace(1) %outElem14, align 2
+ %outElem16 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 16
+ store i16 %load8, ptr addrspace(1) %outElem16, align 16
+ %outElem22 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 22
+ store i16 %load11, ptr addrspace(1) %outElem22, align 2
+ %outElem24 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
+ store i16 %load12, ptr addrspace(1) %outElem24, align 8
+ %outElem30 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 30
+ store i16 %load15, ptr addrspace(1) %outElem30, align 2
+
+ ret void
+}
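
For the store side, a minimal sketch distilled from @singleGap above (illustrative only, condensing the CHECK lines): i64 stores at offsets 0, 8, and 24 of a 32-byte-aligned pointer become one masked store whose missing lane is poison and is disabled in the mask.

  ; before (as written in @singleGap)
  store i64 1, ptr addrspace(1) %out, align 32
  %getElem1 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 8
  store i64 2, ptr addrspace(1) %getElem1, align 8
  %getElem3 = getelementptr inbounds i8, ptr addrspace(1) %out, i32 24
  store i64 4, ptr addrspace(1) %getElem3, align 8

  ; after gap filling and vectorization into a masked store
  call void @llvm.masked.store.v4i64.p1(<4 x i64> <i64 1, i64 2, i64 poison, i64 4>, ptr addrspace(1) %out, i32 32, <4 x i1> <i1 true, i1 true, i1 false, i1 true>)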
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
index 2d3c289c2a12b..e031daab6d786 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/vectorize_i8.ll
@@ -40,8 +40,7 @@ define void @int8x3a4(ptr nocapture align 4 %ptr) {
ret void
; CHECK-LABEL: @int8x3a4
-; CHECK: load <2 x i8>
-; CHECK: load i8
+; CHECK: load <4 x i8>
; CHECK: store <2 x i8>
; CHECK: store i8
}
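
The updated check reflects the new load widening: the three contiguous i8 loads in @int8x3a4 (from a 4-byte-aligned base) can now be widened to a single <4 x i8> load. A minimal sketch of the expected shape, assuming the usual extractelement expansion (offsets and value names are illustrative, not taken from the test):

  ; before: three i8 loads at offsets 0, 1, and 2 from a 4-byte-aligned pointer
  %l0 = load i8, ptr %ptr, align 4
  %gep1 = getelementptr inbounds i8, ptr %ptr, i64 1
  %l1 = load i8, ptr %gep1, align 1
  %gep2 = getelementptr inbounds i8, ptr %ptr, i64 2
  %l2 = load i8, ptr %gep2, align 1

  ; after widening to the next power of two (lane 3 is unused)
  %vec = load <4 x i8>, ptr %ptr, align 4
  %l0.v = extractelement <4 x i8> %vec, i32 0
  %l1.v = extractelement <4 x i8> %vec, i32 1
  %l2.v = extractelement <4 x i8> %vec, i32 2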
>From 68a88d16ed381b1ebbd713566d781814e3284204 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Wed, 17 Sep 2025 15:48:02 +0000
Subject: [PATCH 2/5] Clang format
---
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 04f4e92826a52..d452e1609957a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -930,9 +930,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
}
Chain ExtendingLoadsStores;
- bool ExtendChain = IsLoadChain
- ? ExtendLoads
- : ExtendStores;
+ bool ExtendChain = IsLoadChain ? ExtendLoads : ExtendStores;
if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 &&
VecElemBits >= 8) {
// TargetVF may be a lot higher than NumVecElems,
@@ -1047,8 +1045,8 @@ bool Vectorizer::vectorizeChain(Chain &C) {
// If we are left with a two-element chain, and one of the elements is an
// extra element, we don't want to vectorize
- if (C.size() == 2 && (ExtraElements.contains(C[0].Inst) ||
- ExtraElements.contains(C[1].Inst)))
+ if (C.size() == 2 &&
+ (ExtraElements.contains(C[0].Inst) || ExtraElements.contains(C[1].Inst)))
return false;
sortChainInOffsetOrder(C);
>From 001b4095a931db40264e782afaa443af5cce3ed6 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 18 Sep 2025 15:37:56 +0000
Subject: [PATCH 3/5] Remove cl opts
---
.../Vectorize/LoadStoreVectorizer.cpp | 32 ++-----------------
1 file changed, 3 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index d452e1609957a..b0f7f12b157f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -119,29 +119,6 @@ using namespace llvm;
#define DEBUG_TYPE "load-store-vectorizer"
-cl::opt<bool>
- ExtendLoads("vect-extend-loads", cl::Hidden,
- cl::desc("Load more elements if the target VF is higher "
- "than the chain length."),
- cl::init(true));
-
-cl::opt<bool> ExtendStores(
- "vect-extend-stores", cl::Hidden,
- cl::desc("Store more elements if the target VF is higher "
- "than the chain length and we have access to masked stores."),
- cl::init(true));
-
-cl::opt<bool> FillLoadGaps(
- "vect-fill-load-gaps", cl::Hidden,
- cl::desc("Should Loads be introduced in gaps to enable vectorization."),
- cl::init(true));
-
-cl::opt<bool>
- FillStoreGaps("vect-fill-store-gaps", cl::Hidden,
- cl::desc("Should Stores be introduced in gaps to enable "
- "vectorization into masked stores."),
- cl::init(true));
-
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
@@ -689,9 +666,8 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
// store for the target. If later on, we don't end up with a chain that
// could be vectorized into a legal masked store, the chains with extra
// elements will be filtered out in splitChainByAlignment.
- bool TryFillGaps = isa<LoadInst>(C[0].Inst)
- ? (FillLoadGaps && TTI.isLegalToWidenLoads())
- : (FillStoreGaps && shouldAttemptMaskedStore(C));
+ bool TryFillGaps = isa<LoadInst>(C[0].Inst) ? TTI.isLegalToWidenLoads()
+ : shouldAttemptMaskedStore(C);
unsigned ASPtrBits =
DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst));
@@ -930,9 +906,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
}
Chain ExtendingLoadsStores;
- bool ExtendChain = IsLoadChain ? ExtendLoads : ExtendStores;
- if (ExtendChain && NumVecElems < TargetVF && NumVecElems % 2 != 0 &&
- VecElemBits >= 8) {
+ if (NumVecElems < TargetVF && NumVecElems % 2 != 0 && VecElemBits >= 8) {
// TargetVF may be a lot higher than NumVecElems,
// so only extend to the next power of 2.
assert(VecElemBits % 8 == 0);
>From da7391b6e6d22b96ded1e6cc71fe2141965d936a Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Thu, 18 Sep 2025 15:58:56 +0000
Subject: [PATCH 4/5] Add context argument to TTI API
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 2 +-
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h | 4 +++-
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 7 ++++---
4 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f8f134c833ea2..aaf37c25939c8 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -821,7 +821,7 @@ class TargetTransformInfo {
/// assuming the result is still well-aligned. For example, converting a load
/// i32 to a load i64, or vectorizing three continuous load i32s into a load
/// <4 x i32>.
- LLVM_ABI bool isLegalToWidenLoads() const;
+ LLVM_ABI bool isLegalToWidenLoads(LLVMContext &Context) const;
/// Return true if the target supports nontemporal store.
LLVM_ABI bool isLegalNTStore(Type *DataType, Align Alignment) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 55bd4bd709589..69dd9bffdf8d9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -318,7 +318,7 @@ class TargetTransformInfoImplBase {
return false;
}
- virtual bool isLegalToWidenLoads() const { return false; }
+ virtual bool isLegalToWidenLoads(LLVMContext &Context) const { return false; }
virtual bool isLegalNTStore(Type *DataType, Align Alignment) const {
// By default, assume nontemporal memory stores are available for stores
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d56cff1ce3695..9ed9f10f770a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -72,7 +72,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
}
- bool isLegalToWidenLoads() const override { return true; };
+ bool isLegalToWidenLoads(LLVMContext &Context) const override {
+ return true;
+ };
// NVPTX has infinite registers of all kinds, but the actual machine doesn't.
// We conservatively return 1 here which is just enough to enable the
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index b0f7f12b157f3..bfdf18b582e7f 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -666,8 +666,9 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
// store for the target. If later on, we don't end up with a chain that
// could be vectorized into a legal masked store, the chains with extra
// elements will be filtered out in splitChainByAlignment.
- bool TryFillGaps = isa<LoadInst>(C[0].Inst) ? TTI.isLegalToWidenLoads()
- : shouldAttemptMaskedStore(C);
+ bool TryFillGaps = isa<LoadInst>(C[0].Inst)
+ ? TTI.isLegalToWidenLoads(F.getContext())
+ : shouldAttemptMaskedStore(C);
unsigned ASPtrBits =
DL.getIndexSizeInBits(getLoadStoreAddressSpace(C[0].Inst));
@@ -924,7 +925,7 @@ std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
// otherwise we may unnecessary split the chain when the target actually
// supports non-pow2 VF.
if (accessIsAllowedAndFast(NewSizeBytes, AS, Alignment, VecElemBits) &&
- ((IsLoadChain ? TTI.isLegalToWidenLoads()
+ ((IsLoadChain ? TTI.isLegalToWidenLoads(F.getContext())
: TTI.isLegalMaskedStore(
FixedVectorType::get(VecElemTy, NewNumVecElems),
Alignment, AS, /*IsMaskConstant=*/true)))) {
>From 838017430d514878d37421cfe9eec6ad1a7b9a50 Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dakersnar at me.com>
Date: Thu, 25 Sep 2025 11:21:06 -0500
Subject: [PATCH 5/5] Update
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index bfdf18b582e7f..bf8dd2580ff80 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -365,7 +365,7 @@ class Vectorizer {
/// deleted before the end of the pass.
ChainElem createExtraElementAfter(const ChainElem &PrevElem, APInt Offset,
StringRef Prefix,
- Align Alignment = Align(1));
+ Align Alignment = Align());
/// Delete dead GEPs and extra Load/Store instructions created by
/// createExtraElementAfter