[llvm] [LV] Vectorization of compress idiom (PR #83467)
Kolya Panchenko via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 12:42:32 PST 2024
https://github.com/nikolaypanchenko updated https://github.com/llvm/llvm-project/pull/83467
>From 20fe5ee426ffe05205f19f3641ad6f0e69cfba86 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Tue, 6 Feb 2024 11:59:58 -0800
Subject: [PATCH 1/2] [LV] Support for monotonic idiom
The monotonic idiom is a special form of a loop-carried dependency, which
can be described as
```
m += step
... m ...
```
where
* `m` is a scalar variable,
* `step` is a loop-invariant variable,
* the update is done under some non-uniform condition,
* the use(s) are done under the same or nested condition(s).
Whether `m` is used on the lhs or the rhs determines which special vector
code needs to be generated on the use side.
If `m` is used on the lhs (as a store index), the pattern is known as
compress, since the stored data needs to be compressed before the store.
If `m` is used on the rhs (as a load index), the pattern is known as
expand/decompress, since the loaded data needs to be expanded according to
the mask.
The changeset adds a new descriptor for monotonic values as defined above
and adds initial support for the unit-strided compress store.
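For illustration, here is a minimal C sketch of the two forms of the
idiom (the function names are made up for this example; the lhs/compress
form corresponds to the `compress_store` test added below):
```
// Compress: `m` is used on the lhs (store index), so the selected
// elements of `b` are packed densely into `a`. This is the form the
// patch vectorizes, using llvm.masked.compressstore.
int compress(int n, int *restrict a, const int *restrict b) {
  int m = 0;
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[m++] = b[i];
  return m;
}

// Expand/decompress: `m` is used on the rhs (load index), so the loaded
// elements of `b` need to be expanded according to the mask. This form
// would map to llvm.masked.expandload and is not handled yet.
void expand(int n, int *restrict a, const int *restrict b) {
  int m = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] != 0)
      a[i] = b[m++];
}
```
In the vectorized compress form, the store becomes a masked compressstore
and the scalar `m` is advanced once per vector iteration by the popcount
of the mask times the step, which is what the new recipes below emit.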
---
llvm/include/llvm/Analysis/IVDescriptors.h | 49 ++
.../llvm/Analysis/TargetTransformInfo.h | 8 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +
.../Vectorize/LoopVectorizationLegality.h | 58 ++
llvm/lib/Analysis/IVDescriptors.cpp | 124 ++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 4 +
.../Target/RISCV/RISCVTargetTransformInfo.h | 4 +
.../Vectorize/LoopVectorizationLegality.cpp | 88 +++
.../Transforms/Vectorize/LoopVectorize.cpp | 99 ++-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 1 +
llvm/lib/Transforms/Vectorize/VPlan.h | 110 ++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 16 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 69 ++
.../Transforms/Vectorize/VPlanTransforms.cpp | 62 ++
.../Transforms/Vectorize/VPlanTransforms.h | 4 +
llvm/lib/Transforms/Vectorize/VPlanValue.h | 2 +
.../LoopVectorize/RISCV/compress_expand.ll | 702 ++++++++++++++++++
18 files changed, 1390 insertions(+), 16 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 5c7b613ac48c40..877204a8b2d864 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/ADT/SetVector.h"
namespace llvm {
@@ -395,6 +396,54 @@ class InductionDescriptor {
SmallVector<Instruction *, 2> RedundantCasts;
};
+class MonotonicDescriptor {
+public:
+ /// This enum represents the kinds of monotonic that we support.
+ enum MonotonicKind {
+ MK_None, ///< Not a monotonic variable.
+ MK_Integer, ///< Integer monotonic variable. Step = C
+ MK_Pointer, ///< Pointer monotonic variable. Step = C
+ };
+
+public:
+ MonotonicDescriptor() = default;
+
+ Value *getStartValue() const { return StartValue; }
+ MonotonicKind getKind() const { return MK; }
+ const SCEV *getStep() const { return Step; }
+ const Instruction *getUpdateOp() const { return UpdateOp; }
+ const SetVector<PHINode *> &getPhis() const { return Phis; }
+ bool isHeaderPhi(const PHINode *Phi) const {
+ return !Phis.empty() && Phis[0] == Phi;
+ }
+
+ /// Returns true if \p Phi forms a monotonic pattern within the loop \p L.
+ static MonotonicDescriptor isMonotonicPHI(PHINode *Phi, const Loop *L,
+ PredicatedScalarEvolution &PSE);
+
+ operator bool() const { return MK != MK_None; }
+
+private:
+ /// Private constructor - used by \c isMonotonicPHI
+ MonotonicDescriptor(Value *Start, MonotonicKind K, const SCEV *Step,
+ const Instruction *UpdateOp, SetVector<PHINode *> &Phis)
+ : StartValue(Start), MK(K), Step(Step), UpdateOp(UpdateOp),
+ Phis(Phis.begin(), Phis.end()) {}
+
+ /// Start value.
+ TrackingVH<Value> StartValue = nullptr;
+ /// Induction kind.
+ MonotonicKind MK = MK_None;
+ /// Step value.
+ const SCEV *Step = nullptr;
+ // Instruction that advances induction variable.
+ const Instruction *UpdateOp = nullptr;
+
+ /// All phis that are used to update the monotonic variable. It's expected
+ /// that the first PHINode is in the header BB
+ SetVector<PHINode *> Phis;
+};
+
} // end namespace llvm
#endif // LLVM_ANALYSIS_IVDESCRIPTORS_H
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c0..a7bdefe0d95708 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1701,6 +1701,9 @@ class TargetTransformInfo {
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const;
+ /// \returns true if vectorization of monotonics is supported by the target.
+ bool enableMonotonicVectorization() const;
+
struct VPLegalization {
enum VPTransform {
// keep the predicating parameter
@@ -2131,6 +2134,7 @@ class TargetTransformInfo::Concept {
virtual bool supportsScalableVectors() const = 0;
virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
Align Alignment) const = 0;
+ virtual bool enableMonotonicVectorization() const = 0;
virtual VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2874,6 +2878,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
}
+ bool enableMonotonicVectorization() const override {
+ return Impl.enableMonotonicVectorization();
+ }
+
VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 13379cc126a40c..e77838882ee725 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -923,6 +923,8 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool enableMonotonicVectorization() const { return false; }
+
TargetTransformInfo::VPLegalization
getVPLegalizationStrategy(const VPIntrinsic &PI) const {
return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b3..9896211ca11d83 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -257,6 +257,10 @@ class LoopVectorizationLegality {
/// induction descriptor.
using InductionList = MapVector<PHINode *, InductionDescriptor>;
+ /// MonotonicPhiList contains phi nodes that represent monotonic idiom
+ using MonotonicPhiList =
+ MapVector<const PHINode *, MonotonicDescriptor>;
+
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
@@ -306,6 +310,42 @@ class LoopVectorizationLegality {
/// Returns True if V is a Phi node of an induction variable in this loop.
bool isInductionPhi(const Value *V) const;
+ /// Returns the Monotonics found in the loop
+ const MonotonicPhiList &getMonotonics() const { return MonotonicPhis; }
+
+ /// Returns the MonotonicDescriptor associated with the instruction \p I.
+ /// Returns nullptr if \p I is not monotonic.
+ const MonotonicDescriptor *getMonotonicDescriptor(const Instruction *I) const {
+ for (const auto &PMD : getMonotonics()) {
+ if (const auto *Phi = dyn_cast<const PHINode>(I))
+ if (PMD.second.getPhis().contains(const_cast<PHINode *>(Phi)))
+ return &PMD.second;
+ if (PMD.second.getUpdateOp() == I)
+ return &PMD.second;
+ }
+ return nullptr;
+ }
+
+ /// Returns true if \p I instruction is a header phi of the monotonic.
+ bool isMonotonicPhi(const Instruction *I) const {
+ const auto *Phi = dyn_cast<PHINode>(I);
+ return Phi && MonotonicPhis.contains(Phi);
+ }
+
+ /// Returns true if \p V value is a header phi of the monotonic.
+ bool isMonotonicPhi(const Value *V) const {
+ const auto *I = dyn_cast<Instruction>(V);
+ return I && isMonotonicPhi(I);
+ }
+
+ /// Returns true if \p I instruction is an update instruction of the
+ /// monotonic.
+ bool isMonotonicUpdate(const Instruction *I) const {
+ return any_of(getMonotonics(), [I](const auto &PMD) {
+ return PMD.second.getUpdateOp() == I;
+ });
+ }
+
/// Returns a pointer to the induction descriptor, if \p Phi is an integer or
/// floating point induction.
const InductionDescriptor *getIntOrFpInductionDescriptor(PHINode *Phi) const;
@@ -346,6 +386,13 @@ class LoopVectorizationLegality {
/// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
+ /// Returns true if \p Ptr depends on a monotonic value and the pointer
+ /// difference between two iterations is one when the monotonic is updated.
+ bool isConsecutiveMonotonicPtr(Value *Ptr) const;
+
+ /// Return true if \p Ptr computation depends on monotonic value.
+ bool ptrHasMonotonicOperand(Value *Ptr) const;
+
/// Returns true if value V is uniform across \p VF lanes, when \p VF is
/// provided, and otherwise if \p V is invariant across all loop iterations.
bool isInvariant(Value *V) const;
@@ -443,6 +490,11 @@ class LoopVectorizationLegality {
/// specific checks for outer loop vectorization.
bool canVectorizeOuterLoop();
+ /// Return true if the loop vectorizer can generate correct code for the
+ /// given monotonic. The method is needed to gradually enable vectorization
+ /// of monotonics.
+ bool canVectorizeMonotonic(const MonotonicDescriptor &MD);
+
/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -460,6 +512,9 @@ class LoopVectorizationLegality {
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
+ /// Add MonotonicDescriptor
+ void addMonotonic(const MonotonicDescriptor &MD);
+
/// The loop that we evaluate.
Loop *TheLoop;
@@ -510,6 +565,9 @@ class LoopVectorizationLegality {
/// loop body.
SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
+ /// Holds the phis of the monotonics
+ MonotonicPhiList MonotonicPhis;
+
/// Holds the phi nodes that are fixed-order recurrences.
RecurrenceSet FixedOrderRecurrences;
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 055f121e743411..9194a5622b7dc2 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -1475,6 +1475,130 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
}
+MonotonicDescriptor
+MonotonicDescriptor::isMonotonicPHI(PHINode *Phi, const Loop *L,
+ PredicatedScalarEvolution &PSE) {
+ // Monotonic is a special loop-carried dependency which is
+ // incremented by an invariant value under some condition and used under the
+ // same or a nested condition. That differs from a conditional reduction,
+ // which does not allow uses at all.
+
+ // Don't allow multiple updates of the value
+ if (Phi->getNumIncomingValues() != 2)
+ return MonotonicDescriptor();
+
+ Type *Ty = Phi->getType();
+ if (!Ty->isIntegerTy() && !Ty->isPointerTy())
+ return MonotonicDescriptor();
+
+ SetVector<PHINode *> Visited;
+ Visited.insert(Phi);
+ SmallVector<PHINode *> Worklist;
+
+ for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+ if (Phi->getIncomingBlock(I) == L->getLoopPreheader())
+ continue;
+ auto *P = dyn_cast<PHINode>(Phi->getIncomingValue(I));
+ if (!P)
+ return MonotonicDescriptor();
+ Worklist.push_back(P);
+ }
+
+ auto FindSelfUpdate = [&]() -> Instruction * {
+ Instruction *SelfUpdate = nullptr;
+ // Visit use-def chain of the Phi expecting all incoming values as phis
+ // which are used just once, i.e. within that chain.
+ while (!Worklist.empty()) {
+ PHINode *P = Worklist.pop_back_val();
+ if (Visited.contains(P))
+ continue;
+
+ Visited.insert(P);
+ // Expect all phis to be part of the loop
+ if (!L->contains(P))
+ return nullptr;
+
+ for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; ++I) {
+ Value *V = P->getIncomingValue(I);
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ Worklist.push_back(PN);
+ continue;
+ }
+ if (SelfUpdate != nullptr)
+ return nullptr;
+
+ if ((Ty->isIntegerTy() && !isa<BinaryOperator>(V)) ||
+ (Ty->isPointerTy() && !isa<GetElementPtrInst>(V)))
+ return nullptr;
+
+ SelfUpdate = cast<Instruction>(V);
+ }
+ }
+ return SelfUpdate;
+ };
+ Instruction *SelfUpdate = FindSelfUpdate();
+
+ // Expect `SelfUpdate` to be used only once
+ // TODO: Support monotonic with a pre-increment
+ if (!SelfUpdate || SelfUpdate->getNumUses() != 1)
+ return MonotonicDescriptor();
+
+ Value *Step = nullptr;
+ if (auto *GEPUpdate = dyn_cast<GetElementPtrInst>(SelfUpdate)) {
+ if (GEPUpdate->getNumOperands() != 2)
+ return MonotonicDescriptor();
+
+ Step = GEPUpdate->getOperand(1);
+ // TODO: Re-enable update via GEP. This will require changes in VPlan to
+ // correctly print and generate updates
+ return MonotonicDescriptor();
+ }
+ auto *BO = cast<BinaryOperator>(SelfUpdate);
+ // TODO: support instructions other than Add to update the monotonic variable
+ if (BO->getOpcode() != Instruction::Add)
+ return MonotonicDescriptor();
+
+ // Either `nsw` or `nuw` should be set, otherwise it's not safe to assume
+ // monotonic won't wrap.
+ if (!BO->hasNoSignedWrap() && !BO->hasNoUnsignedWrap())
+ return MonotonicDescriptor();
+ Step = BO->getOperand(0) == Phi ? BO->getOperand(1) : BO->getOperand(0);
+
+ if (!L->isLoopInvariant(Step))
+ return MonotonicDescriptor();
+
+ auto *StepSCEV = PSE.getSCEV(Step);
+ if (auto *C = dyn_cast<SCEVConstant>(StepSCEV))
+ // TODO: handle step != 1
+ if (!C->isOne())
+ return MonotonicDescriptor();
+
+ // It's important to check all uses of the Phi and make sure they are either
+ // outside of the loop or in the same block as the update.
+ // TODO: Support uses under a nested predicate, which the vectorizer can support.
+ for (User *U : Phi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!L->contains(UI))
+ continue;
+
+ // Ignore phis that are necessary to represent self-update
+ if (auto *P = dyn_cast<PHINode>(UI))
+ if (Visited.contains(P))
+ continue;
+
+ BasicBlock *UIParent = UI->getParent();
+ if (UIParent != SelfUpdate->getParent())
+ return MonotonicDescriptor();
+ }
+
+ Value *StartValue = Phi->getIncomingValueForBlock(L->getLoopPreheader());
+ // Record all visited Phis in a vector and place Phi at the beginning to
+ // simplify future analysis.
+ return MonotonicDescriptor(StartValue,
+ Ty->isPointerTy() ? MK_Pointer : MK_Integer,
+ StepSCEV, SelfUpdate, Visited);
+}
+
bool InductionDescriptor::isInductionPHI(
PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE,
InductionDescriptor &D, const SCEV *Expr,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620e..becc8e821fd346 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1302,6 +1302,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
}
+bool TargetTransformInfo::enableMonotonicVectorization() const {
+ return TTIImpl->enableMonotonicVectorization();
+}
+
TargetTransformInfo::Concept::~Concept() = default;
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2e4e69fb4f920f..da01d2f986b4b4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1609,3 +1609,7 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+
+bool RISCVTTIImpl::enableMonotonicVectorization() const {
+ return ST->hasVInstructions();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index af36e9d5d5e886..c5e6fc26605b28 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -372,6 +372,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2);
+ /// \returns true if the ISA supports all instructions needed to vectorize
+ /// monotonics.
+ bool enableMonotonicVectorization() const;
+
bool shouldFoldTerminatingConditionAfterLSR() const {
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 37a356c43e29a4..77348826e067cf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
"Scalable vectorization is available and favored when the "
"cost is inconclusive.")));
+static cl::opt<bool>
+ EnableMonotonics("enable-monotonics", cl::init(true), cl::Hidden,
+ cl::desc("Control whether vectorization of loops with "
+ "monotonic variables is enabled"));
+
/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
@@ -471,6 +476,36 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
return 0;
}
+bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
+ assert(ptrHasMonotonicOperand(Ptr) &&
+ "Pointer's computation does not use monotonic values.");
+
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ assert(GEP->getNumOperands() == 2 &&
+ "GetElementPtr with more than 1 indexes is not currently supported "
+ "and should be filtered out before.");
+ Value *Monotonic = GEP->getOperand(1);
+ if (auto *Cast = dyn_cast<CastInst>(Monotonic))
+ Monotonic = Cast->getOperand(0);
+ const MonotonicDescriptor *MD =
+ getMonotonicDescriptor(cast<Instruction>(Monotonic));
+ assert(MD && "The index has no MonotonicDescriptor associated with it.");
+ const SCEVConstant *Step = dyn_cast<SCEVConstant>(MD->getStep());
+ return Step && Step->getAPInt().getZExtValue() == 1;
+}
+
+bool LoopVectorizationLegality::ptrHasMonotonicOperand(
+ Value *Ptr) const {
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return false;
+ return any_of(GEP->operands(), [&](Value *V) {
+ if (auto *Cast = dyn_cast<CastInst>(V))
+ return isMonotonicPhi(Cast->getOperand(0));
+ return isMonotonicPhi(V);
+ });
+}
+
bool LoopVectorizationLegality::isInvariant(Value *V) const {
return LAI->isInvariant(V);
}
@@ -678,6 +713,47 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
return Result;
}
+bool LoopVectorizationLegality::canVectorizeMonotonic(const MonotonicDescriptor &MD) {
+ Value *Monotonic = MD.getPhis().front();
+ auto IsUserInLoop = [&](User *U) -> bool {
+ auto *I = dyn_cast<Instruction>(U);
+ return I && TheLoop->contains(I);
+ };
+ auto CanIgnoreUser = [&](User *U) -> bool {
+ if (auto *PN = dyn_cast<PHINode>(U))
+ if (MD.getPhis().contains(PN))
+ return true;
+ return U == MD.getUpdateOp();
+ };
+
+ for (User *U : Monotonic->users()) {
+ if (!IsUserInLoop(U) || CanIgnoreUser(U))
+ continue;
+
+ // For now, expect the monotonic value to be used by a zext/sext with a
+ // single user or by a GEP.
+ if (U->hasOneUser() && isa<ZExtInst, SExtInst>(U))
+ U = *cast<Instruction>(U)->users().begin();
+
+ if (!isa<GetElementPtrInst>(U))
+ return false;
+
+ // All GEPs should be used as the pointer operand of a store, which
+ // represents a compressstore.
+ if (any_of(U->users(), [&](User *UI) {
+ if (!IsUserInLoop(UI) || CanIgnoreUser(UI))
+ return false;
+ return UI != MD.getUpdateOp() &&
+ (!isa<StoreInst>(UI) || getLoadStorePointerOperand(UI) != U);
+ })) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Expand of a monotonic value is not yet supported.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
void LoopVectorizationLegality::addInductionPhi(
PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit) {
@@ -730,6 +806,11 @@ void LoopVectorizationLegality::addInductionPhi(
LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
+void LoopVectorizationLegality::addMonotonic(const MonotonicDescriptor &MD) {
+ for (PHINode *P : MD.getPhis())
+ MonotonicPhis[P] = MD;
+}
+
bool LoopVectorizationLegality::setupOuterLoopInductions() {
BasicBlock *Header = TheLoop->getHeader();
@@ -880,6 +961,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
addInductionPhi(Phi, ID, AllowedExit);
continue;
}
+ if (EnableMonotonics && TTI->enableMonotonicVectorization())
+ if (auto MD =
+ MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
+ if (canVectorizeMonotonic(MD)) {
+ addMonotonic(MD);
+ continue;
+ }
reportVectorizationFailure("Found an unidentified PHI",
"value that could not be identified as "
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 50a073e890626e..ab90b5dc50ab69 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1253,7 +1253,8 @@ class LoopVectorizationCostModel {
CM_GatherScatter,
CM_Scalarize,
CM_VectorCall,
- CM_IntrinsicCall
+ CM_IntrinsicCall,
+ CM_MonotonicUnit, // For consecutive accesses with monotonic +1
};
/// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1454,6 +1455,10 @@ class LoopVectorizationCostModel {
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
+ /// Returns true if \p I is a memory instruction with monotonic index(es) that
+ /// can be widened
+ bool memoryInstructionUsesMonotonic(Instruction *I, ElementCount VF);
+
/// Returns true if \p I is a memory instruction in an interleaved-group
/// of memory accesses that can be vectorized with wide vector loads/stores
/// and shuffles.
@@ -1647,6 +1652,10 @@ class LoopVectorizationCostModel {
/// memory access.
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
+ /// The cost computation for widening memory access \p I which has monotonic
+ /// index.
+ InstructionCost getMonotonicMemoryOpCost(Instruction *I, ElementCount VF);
+
/// The cost calculation for Load/Store instruction \p I with uniform pointer -
/// Load: scalar load + broadcast.
/// Store: scalar store + (loop invariant value stored? 0 : extract of last
@@ -3707,7 +3716,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
return WideningDecision == CM_Scalarize;
assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
"Ptr is neither a value or pointer operand");
- return WideningDecision != CM_GatherScatter;
+ return WideningDecision != CM_GatherScatter &&
+ WideningDecision != CM_MonotonicUnit;
};
// A helper that returns true if the given value is a bitcast or
@@ -4102,6 +4112,12 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
return true;
}
+bool LoopVectorizationCostModel::memoryInstructionUsesMonotonic(
+ Instruction *I, ElementCount VF) {
+ assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+ return Legal->ptrHasMonotonicOperand(getLoadStorePointerOperand(I));
+}
+
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// We should not collect Uniforms more than once per VF. Right now,
// this function is called from collectUniformsAndScalars(), which
@@ -4188,6 +4204,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
return (WideningDecision == CM_Widen ||
WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_MonotonicUnit ||
WideningDecision == CM_Interleave);
};
@@ -5257,7 +5274,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
return 1;
// We used the distance for the interleave count.
- if (!Legal->isSafeForAnyVectorWidth())
+ if (!Legal->isSafeForAnyVectorWidth() || !Legal->getMonotonics().empty())
return 1;
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
@@ -6054,6 +6071,29 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
return Cost;
}
+InstructionCost
+LoopVectorizationCostModel::getMonotonicMemoryOpCost(Instruction *I,
+ ElementCount VF) {
+ Type *ValTy = getLoadStoreType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ Value *Ptr = getLoadStorePointerOperand(I);
+
+ if (!Legal->isConsecutiveMonotonicPtr(Ptr))
+ return InstructionCost::getInvalid();
+ if (isa<LoadInst>(I))
+ return InstructionCost::getInvalid();
+
+ LLVMContext &Ctx = I->getContext();
+ SmallVector<Type *> ParamTys;
+ ParamTys.push_back(VectorTy);
+ ParamTys.push_back(Ptr->getType());
+ ParamTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), VF));
+ IntrinsicCostAttributes CostAttrs(Intrinsic::masked_compressstore,
+ Type::getVoidTy(Ctx), ParamTys);
+ return TTI.getIntrinsicInstrCost(CostAttrs,
+ TargetTransformInfo::TCK_RecipThroughput);
+}
+
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
@@ -6442,6 +6482,11 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (!foldTailByMasking())
return true;
+ // A load or store with a monotonic index in its pointer computation
+ // requires special handling of the mask.
+ if (Legal->ptrHasMonotonicOperand(Ptr))
+ return false;
+
// For scalable vectors, a uniform memop load is always
// uniform-by-parts and we know how to scalarize that.
if (isa<LoadInst>(I))
@@ -6487,6 +6532,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
continue;
}
+ if (memoryInstructionUsesMonotonic(&I, VF)) {
+ assert(
+ Legal->isConsecutiveMonotonicPtr(getLoadStorePointerOperand(&I)) &&
+ "Expected consecutive monotonic pointer");
+ setWideningDecision(&I, VF, CM_MonotonicUnit,
+ getMonotonicMemoryOpCost(&I, VF));
+ continue;
+ }
+
// Choose between Interleaving, Gather/Scatter or Scalarization.
InstructionCost InterleaveCost = InstructionCost::getInvalid();
unsigned NumAccesses = 1;
@@ -6901,6 +6955,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;
+ if (auto *MD = Legal->getMonotonicDescriptor(I)) {
+ if (VF.isScalable())
+ return InstructionCost::getInvalid();
+ InstructionCost Cost = 0;
+
+ Type *ScalarTy = Type::getIntNTy(I->getContext(), VF.getKnownMinValue());
+ IntrinsicCostAttributes CostAttrs(Intrinsic::ctpop, ScalarTy, {ScalarTy});
+
+ Cost += TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+ return Cost;
+ }
+
// Certain instructions can be cheaper to vectorize if they have a constant
// second vector operand. One example of this are shifts on x86.
Value *Op2 = I->getOperand(1);
@@ -7034,6 +7100,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
case LoopVectorizationCostModel::CM_VectorCall:
case LoopVectorizationCostModel::CM_IntrinsicCall:
llvm_unreachable_internal("Instr has invalid widening decision");
+ case LoopVectorizationCostModel::CM_MonotonicUnit:
+ return TTI::CastContextHint::Masked;
}
llvm_unreachable("Unhandled case!");
@@ -8060,9 +8128,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
// reverse consecutive.
LoopVectorizationCostModel::InstWidening Decision =
CM.getWideningDecision(I, Range.Start);
+ bool Monotonic = Decision == LoopVectorizationCostModel::CM_MonotonicUnit;
bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
bool Consecutive =
- Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+ Reverse || Monotonic || Decision == LoopVectorizationCostModel::CM_Widen;
VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
if (Consecutive) {
@@ -8080,7 +8149,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
- Consecutive, Reverse);
+ Consecutive, Reverse, Monotonic);
}
/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -8423,7 +8492,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
return Recipe;
VPHeaderPHIRecipe *PhiRecipe = nullptr;
- assert((Legal->isReductionVariable(Phi) ||
+ assert((Legal->isReductionVariable(Phi) || Legal->isMonotonicPhi(Phi) ||
Legal->isFixedOrderRecurrence(Phi)) &&
"can only widen reductions and fixed-order recurrences here");
VPValue *StartV = Operands[0];
@@ -8435,6 +8504,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc));
+ } else if (Legal->isMonotonicPhi(Phi)) {
+ PhiRecipe = new VPMonotonicHeaderPHIRecipe(Phi, StartV);
} else {
// TODO: Currently fixed-order recurrences are modeled as chains of
// first-order recurrences. If there are no users of the intermediate
@@ -8468,6 +8539,16 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
if (auto *CI = dyn_cast<CallInst>(Instr))
return tryToWidenCall(CI, Operands, Range, Plan);
+ if (Legal->isMonotonicUpdate(Instr)) {
+ const MonotonicDescriptor *MD = Legal->getMonotonicDescriptor(Instr);
+ assert(MD && "Monotonic descriptor was not found");
+ assert(Operands.size() == 2 &&
+ "Only binary monotonic updates are supported");
+ VPValue *Mask = getBlockInMask(Instr->getParent());
+ return new VPMonotonicUpdateInstruction(Mask, Operands[0], Operands[1],
+ Instr->getDebugLoc(), *MD);
+ }
+
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range, Plan);
@@ -9372,6 +9453,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
assert((LI || SI) && "Invalid Load/Store instruction");
assert((!SI || StoredValue) && "No stored value provided for widened store");
assert((!LI || !StoredValue) && "Stored value provided for widened load");
+ assert((!Monotonic || !LI) && "Expand load is not yet supported");
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
@@ -9405,6 +9487,11 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
Value *VectorGep = State.get(getAddr(), Part);
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
MaskPart);
+ } else if (isMonotonic()) {
+ auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
+ NewSI = Builder.CreateIntrinsic(
+ Builder.getVoidTy(), Intrinsic::masked_compressstore,
+ {StoredVal, VecPtr, BlockInMaskParts[Part]});
} else {
if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4aeab6fc619988..37bae74ed9acb7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -888,6 +888,7 @@ void VPlan::execute(VPTransformState *State) {
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPMonotonicHeaderPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 16c09a83e777dd..0297b5bae7f9ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -866,7 +866,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenIntOrFpInductionSC:
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
+ case VPRecipeBase::VPMonotonicHeaderPHISC:
case VPRecipeBase::VPScalarCastSC:
+ case VPRecipeBase::VPMonotonicUpdateSC:
return true;
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPBranchOnMaskSC:
@@ -1155,6 +1157,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
BranchOnCount,
BranchOnCond,
ComputeReductionResult,
+ MonotonicUpdate,
};
private:
@@ -1790,6 +1793,68 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
};
+class VPMonotonicUpdateInstruction : public VPInstruction {
+private:
+ MonotonicDescriptor MD;
+
+public:
+ explicit VPMonotonicUpdateInstruction(VPValue *Mask, VPValue *Op1,
+ VPValue *Op2, DebugLoc DL,
+ MonotonicDescriptor MD,
+ const Twine &Name = "")
+ : VPInstruction(VPInstruction::MonotonicUpdate, {Op1, Op2}, DL, Name),
+ MD(MD) {
+ addOperand(Mask);
+ setUnderlyingValue(
+ cast<Value>(const_cast<Instruction *>(MD.getUpdateOp())));
+ }
+
+ explicit VPMonotonicUpdateInstruction() = delete;
+ ~VPMonotonicUpdateInstruction() override = default;
+
+ const MonotonicDescriptor &getMonotonicDescriptor() const { return MD; }
+
+ // Returns the incoming value from the loop backedge.
+ VPValue *getIncomingValue() const { return getOperand(0); }
+
+ // Returns the step value from the loop backedge.
+ VPValue *getStepValue() const { return getOperand(1); }
+
+ /// Returns the mask value of the instruction
+ VPValue *getMask() const { return getOperand(2); }
+
+ VPRecipeBase *clone() override {
+ return new VPMonotonicUpdateInstruction(getMask(), getIncomingValue(),
+ getStepValue(), getDebugLoc(),
+ getMonotonicDescriptor());
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ auto *R = cast<VPRecipeBase>(D);
+ auto *I = dyn_cast<VPInstruction>(R);
+ return I && I->getOpcode() == VPInstruction::MonotonicUpdate;
+ }
+
+ static inline bool classof(const VPRecipeBase *R) {
+ auto *VPInst = dyn_cast<VPInstruction>(R);
+ return VPInst && VPInst->getOpcode() == VPInstruction::MonotonicUpdate;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && VPMonotonicUpdateInstruction::classof(R);
+ }
+
+ void execute(VPTransformState &State) override final;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for handling first-order recurrence phis. The start value is the
/// first operand of the recipe and the incoming value from the backedge is the
/// second operand.
@@ -2185,6 +2250,35 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
}
};
+/// VPMonotonicHeaderPHIRecipe represents a phi of the monotonic value
+class VPMonotonicHeaderPHIRecipe final : public VPHeaderPHIRecipe {
+public:
+ VPMonotonicHeaderPHIRecipe(PHINode *Phi, VPValue *StartValue)
+ : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi,
+ StartValue) {}
+
+ ~VPMonotonicHeaderPHIRecipe() override = default;
+
+ void execute(VPTransformState &State) override;
+
+ VPRecipeBase *clone() override {
+ return new VPMonotonicHeaderPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+ getOperand(0));
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ VP_CLASSOF_IMPL(VPDef::VPMonotonicHeaderPHISC);
+
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPDef::VPMonotonicHeaderPHISC;
+ }
+};
+
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
/// control converges back from a Branch-on-Mask. The phi nodes are needed in
/// order to merge values that are set under such a branch and feed their uses.
@@ -2236,6 +2330,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
// Whether the consecutive loaded/stored addresses are in reverse order.
bool Reverse;
+ // Whether a monotonic value is used as an index in the store.
+ bool Monotonic = false;
+
void setMask(VPValue *Mask) {
if (!Mask)
return;
@@ -2258,10 +2355,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
VPValue *StoredValue, VPValue *Mask,
- bool Consecutive, bool Reverse)
+ bool Consecutive, bool Reverse,
+ bool Monotonic = false)
: VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}),
- Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
+ Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse),
+ Monotonic(Monotonic) {
assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ assert((!Monotonic || Consecutive) &&
+ "Non-consecutive compress store is not supported");
setMask(Mask);
}
@@ -2301,6 +2402,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
// Return whether the loaded-from / stored-to addresses are consecutive.
bool isConsecutive() const { return Consecutive; }
+ // Return whether the store uses a monotonic value in its address computation.
+ bool isMonotonic() const { return Monotonic; }
+
// Return whether the consecutive loaded/stored addresses are in reverse
// order.
bool isReverse() const { return Reverse; }
@@ -3412,6 +3516,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
return Rep->isUniform();
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
+ if (isa<VPMonotonicUpdateInstruction, VPMonotonicHeaderPHIRecipe>(Def))
+ return true;
if (auto *VPI = dyn_cast<VPInstruction>(Def))
return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f55beac2047c94..4cba282b7b5d8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -213,14 +213,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
Type *ResultTy =
TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
.Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
- VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
- [this](const auto *R) {
- // Handle header phi recipes, except VPWienIntOrFpInduction
- // which needs special handling due it being possibly truncated.
- // TODO: consider inferring/caching type of siblings, e.g.,
- // backedge value, here and in cases below.
- return inferScalarType(R->getStartValue());
- })
+ VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+ VPMonotonicHeaderPHIRecipe>([this](const auto *R) {
+ // Handle header phi recipes, except VPWidenIntOrFpInduction
+ // which needs special handling due to it being possibly truncated.
+ // TODO: consider inferring/caching type of siblings, e.g.,
+ // backedge value, here and in cases below.
+ return inferScalarType(R->getStartValue());
+ })
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
[](const auto *R) { return R->getScalarType(); })
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 27b72575ddd51a..6b89a53600dd6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1087,6 +1087,49 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
VecInd->addIncoming(LastInduction, VectorPH);
}
+/// Generate phi for the monotonic:
+/// %monotonic = phi [%monotonic.update, %vector.latch]
+void VPMonotonicHeaderPHIRecipe::execute(VPTransformState &State) {
+ IRBuilder<>::InsertPointGuard Guard(State.Builder);
+ State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+
+ Value *StartV = State.get(getStartValue(), VPIteration(0, 0));
+ auto *Phi = State.Builder.CreatePHI(StartV->getType(), 2, "monotonic.phi");
+ BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+ Phi->addIncoming(StartV, PreheaderBB);
+
+ // Use the same Phi for all Parts
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.set(this, Phi, Part, /*IsScalar=*/true);
+}
+
+/// Generate the following sequence to update the scalar monotonic variable:
+/// %bcast = bitcast %mask to iVF
+/// %0 = llvm.ctpop(%bcast)
+/// %1 = mul %0, %step // where %step is the step of the monotonic
+/// %monotonic.update = add %monotonic, %1
+void VPMonotonicUpdateInstruction::execute(VPTransformState &State) {
+ assert(State.UF == 1 && "Unrolling is not supported.");
+ assert(!State.VF.isScalable() &&
+ "Scalable vectorization of monotonics is not yet supported.");
+ auto &Builder = State.Builder;
+ Value *V = State.get(getIncomingValue(), 0, /*NeedsScalar=*/true);
+ Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+ Value *Mask = State.get(getMask(), 0);
+
+ Value *ScalarMask = Builder.CreateBitCast(
+ Mask, Builder.getIntNTy(State.VF.getKnownMinValue()));
+ Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop,
+ {ScalarMask->getType()}, {ScalarMask});
+ Popc = Builder.CreateZExtOrTrunc(Popc, Step->getType());
+ Step = Builder.CreateMul(Step, Popc, "monotonic.vf.step");
+ const auto *OrigUpdateOp = cast<BinaryOperator>(MD.getUpdateOp());
+ Value *NewV = Builder.CreateBinOp(OrigUpdateOp->getOpcode(), V, Step,
+ "monotonic.update");
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.set(this, NewV, Part, /*IsScalar=*/true);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1102,6 +1145,32 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << ", ";
getStepValue()->printAsOperand(O, SlotTracker);
}
+
+void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
+ O << Indent << "monotonic-update ";
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ O << MD.getUpdateOp()->getOpcodeName();
+ O << ' ';
+ getOperand(0)->printAsOperand(O, SlotTracker);
+ O << ", ";
+ getOperand(1)->printAsOperand(O, SlotTracker);
+ O << " @";
+ getMask()->printAsOperand(O, SlotTracker);
+
+ if (auto DL = getDebugLoc()) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
+}
+
+void VPMonotonicHeaderPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = monotonic-phi ";
+ printOperands(O, SlotTracker);
+}
#endif
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9c3f35112b592f..844ac0cdfe1576 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -24,6 +24,8 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
+#define DEBUG_TYPE "loop-vectorize"
+
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -500,6 +502,65 @@ static void removeDeadRecipes(VPlan &Plan) {
}
}
+void VPlanTransforms::simplifyMonotonics(VPlan &Plan) {
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))
+ for (VPRecipeBase &R : *VPBB) {
+ auto *MonotonicPhi = dyn_cast<VPMonotonicHeaderPHIRecipe>(&R);
+ if (!MonotonicPhi)
+ continue;
+
+ DenseSet<VPRecipeBase *> ToRemove;
+ SmallVector<VPRecipeBase *> Worklist = {
+ &MonotonicPhi->getBackedgeRecipe()};
+ VPMonotonicUpdateInstruction *VPMUI = nullptr;
+
+ while (!Worklist.empty()) {
+ VPRecipeBase *RR = Worklist.pop_back_val();
+ if (RR->getParent() != MonotonicPhi->getParent()) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot simplify non-flattened HCFG\n");
+ return;
+ }
+ if (ToRemove.contains(RR) || RR == MonotonicPhi)
+ continue;
+
+ if (auto *MU = dyn_cast<VPMonotonicUpdateInstruction>(RR)) {
+ VPMUI = MU;
+ continue;
+ }
+ auto *VPB = dyn_cast<VPBlendRecipe>(RR);
+ if (!VPB) {
+ LLVM_DEBUG(dbgs()
+ << "LV: Blend recipes are expected to propagate new "
+ "value of monotonic to a header phi\n");
+ return;
+ }
+ ToRemove.insert(RR);
+ for (unsigned I = 0, E = VPB->getNumIncomingValues(); I != E; ++I) {
+ VPValue *V = VPB->getIncomingValue(I);
+ if (!isa<VPRecipeBase>(V->getDefiningRecipe())) {
+ LLVM_DEBUG(dbgs()
+ << "LV: Unsupported VPValue in simplifyMonotonics\n");
+ return;
+ }
+ Worklist.push_back(V->getDefiningRecipe());
+ }
+ }
+ assert(VPMUI && "Monotonic update must exist in a VPlan");
+ // Use VPValue of the monotonic update instruction in a header phi
+ // instead
+ MonotonicPhi->setOperand(1, VPMUI);
+ for (auto &PV : Plan.getLiveOuts()) {
+ VPLiveOut *LO = PV.second;
+ if (ToRemove.contains(LO->getOperand(0)->getDefiningRecipe()))
+ LO->setOperand(0, VPMUI);
+ }
+ for (VPRecipeBase *RR : ToRemove)
+ RR->eraseFromParent();
+ }
+}
+
static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
ScalarEvolution &SE, Instruction *TruncI,
VPValue *StartV, VPValue *Step,
@@ -1071,6 +1132,7 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
removeDeadRecipes(Plan);
createAndOptimizeReplicateRegions(Plan);
+ simplifyMonotonics(Plan);
removeRedundantExpandSCEVRecipes(Plan);
mergeBlocksIntoPredecessors(Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ff83c3f083b093..b6aa553d808f94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -98,6 +98,10 @@ struct VPlanTransforms {
/// VPlan directly.
static void dropPoisonGeneratingRecipes(
VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication);
+
+ /// Simplify usage of monotonics within a VPlan by removing unnecessary blends
+ /// if HCFG has been flattened
+ static void simplifyMonotonics(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1d2c17e91b7ab9..8bc626419e4d3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -360,6 +360,7 @@ class VPDef {
VPWidenMemoryInstructionSC,
VPWidenSC,
VPWidenSelectSC,
+ VPMonotonicUpdateSC,
// START: Phi-like recipes. Need to be kept together.
VPBlendSC,
VPWidenPHISC,
@@ -371,6 +372,7 @@ class VPDef {
VPFirstOrderRecurrencePHISC,
VPWidenIntOrFpInductionSC,
VPWidenPointerInductionSC,
+ VPMonotonicHeaderPHISC,
VPReductionPHISC,
// END: SubclassID for recipes that inherit VPHeaderPHIRecipe
// END: Phi-like recipes
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
new file mode 100644
index 00000000000000..71de9d495508da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
@@ -0,0 +1,702 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mattr=+v,+d -S -mtriple riscv64 -force-vector-width=4 %s -o - | FileCheck %s
+
+define ptr @compress_on_pointers(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define ptr @compress_on_pointers(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[A_ADDR_1_LCSSA:%.*]] = phi ptr [ [[A_ADDR_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[A_ADDR_0_LCSSA:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A_ADDR_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret ptr [[A_ADDR_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi ptr [ [[A]], [[FOR_BODY_PREHEADER]] ], [ [[A_ADDR_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[A_ADDR_09]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[A_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], [[IF_THEN]] ], [ [[A_ADDR_09]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp8 = icmp sgt i32 %n, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %a.addr.1.lcssa = phi ptr [ %a.addr.1, %for.inc ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %a.addr.0.lcssa = phi ptr [ %a, %entry ], [ %a.addr.1.lcssa, %for.cond.cleanup.loopexit ]
+ ret ptr %a.addr.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %a.addr.09 = phi ptr [ %a, %for.body.preheader ], [ %a.addr.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %incdec.ptr = getelementptr inbounds i32, ptr %a.addr.09, i64 1
+ store i32 %0, ptr %a.addr.09, align 4
+ br label %for.inc
+
+for.inc:
+ %a.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %a.addr.09, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+define i32 @compress_store(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_store(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[MONOTONIC_UPDATE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[MONOTONIC_PHI]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], <4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
+; CHECK-NEXT: [[TMP9:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP8]])
+; CHECK-NEXT: [[TMP10:%.*]] = zext i4 [[TMP9]] to i32
+; CHECK-NEXT: [[MONOTONIC_VF_STEP:%.*]] = mul i32 1, [[TMP10]]
+; CHECK-NEXT: [[MONOTONIC_UPDATE]] = add i32 [[MONOTONIC_PHI]], [[MONOTONIC_VF_STEP]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ], [ [[MONOTONIC_UPDATE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_011:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP12]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[RET_011]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT: store i32 [[TMP12]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[RET_011]], 1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[RET_011]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.011 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idx.ext = sext i32 %ret.011 to i64
+ %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+ store i32 %0, ptr %add.ptr, align 4
+ %inc = add nsw i32 %ret.011, 1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %inc, %if.then ], [ %ret.011, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i64 @compress_store_i64(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i64 @compress_store_i64(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[MONOTONIC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[MONOTONIC_UPDATE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[MONOTONIC_PHI]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP6]], <4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
+; CHECK-NEXT: [[TMP8:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = zext i4 [[TMP8]] to i64
+; CHECK-NEXT: [[MONOTONIC_VF_STEP:%.*]] = mul i64 1, [[TMP9]]
+; CHECK-NEXT: [[MONOTONIC_UPDATE]] = add i64 [[MONOTONIC_PHI]], [[MONOTONIC_VF_STEP]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i64 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ], [ [[MONOTONIC_UPDATE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: ret i64 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_011:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[RET_011]]
+; CHECK-NEXT: store i32 [[TMP11]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT: [[INC:%.*]] = add nsw i64 [[RET_011]], 1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i64 [ [[INC]], [[IF_THEN]] ], [ [[RET_011]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i64 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i64 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.011 = phi i64 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %add.ptr = getelementptr inbounds i32, ptr %a, i64 %ret.011
+ store i32 %0, ptr %add.ptr, align 4
+ %inc = add nsw i64 %ret.011, 1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i64 [ %inc, %if.then ], [ %ret.011, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
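+; The monotonic variable is advanced by 2, so the compressed store is not
+; unit-strided. The CHECK lines only match the scalar loop (no vector body).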
+define i32 @compress_store_strided(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_store_strided(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[RET_010]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RET_010]], 2
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_010]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.010 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idx.ext = sext i32 %ret.010 to i64
+ %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+ store i32 %0, ptr %add.ptr, align 4
+ %add = add nsw i32 %ret.010, 2
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add, %if.then ], [ %ret.010, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
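+; The monotonic variable is decremented (step -1), so this is not a
+; unit-strided compress store. The CHECK lines only match the scalar loop.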
+define i32 @compress_reverse(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_reverse(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[RET_010]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RET_010]], -1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_010]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.010 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idx.ext = sext i32 %ret.010 to i64
+ %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+ store i32 %0, ptr %add.ptr, align 4
+ %add = add nsw i32 %ret.010, -1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add, %if.then ], [ %ret.010, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
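+; The monotonic variable indexes a load (expand/decompress use on the rhs).
+; Only unit-strided compress stores are handled, so the CHECK lines only
+; match the scalar loop.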
+define i32 @expand_store(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RET_012]], 1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idxprom1 = sext i32 %ret.012 to i64
+ %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %1, ptr %arrayidx4, align 4
+ %add = add nsw i32 %ret.012, 1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
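+; Same expand pattern as @expand_store, with a step of 2. The CHECK lines
+; only match the scalar loop.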
+define i32 @expand_store_strided(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store_strided(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RET_012]], 2
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idxprom1 = sext i32 %ret.012 to i64
+ %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %1, ptr %arrayidx4, align 4
+ %add = add nsw i32 %ret.012, 2
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
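+; Same expand pattern as @expand_store, with a step of -1. The CHECK lines
+; only match the scalar loop.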
+define i32 @expand_store_reverse(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store_reverse(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[RET_012]], -1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp11 = icmp sgt i32 %n, 0
+ br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %idxprom1 = sext i32 %ret.012 to i64
+ %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+ %1 = load i32, ptr %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %1, ptr %arrayidx4, align 4
+ %add = add nsw i32 %ret.012, -1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
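+; The monotonic value is used as data (added to the loaded element) rather
+; than as an address. The CHECK lines only match the scalar loop.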
+define i32 @expand(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT: [[RET_014:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[RET_014]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[RET_014]], 1
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[RET_1]] = phi i32 [ [[ADD5]], [[IF_THEN]] ], [ [[RET_014]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+ ret i32 %ret.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %ret.014 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+ %add = add nsw i32 %0, %ret.014
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx4, align 4
+ %add5 = add nsw i32 %ret.014, 1
+ br label %for.inc
+
+for.inc:
+ %ret.1 = phi i32 [ %add5, %if.then ], [ %ret.014, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
From d3256232d31f361099ff917eef19c26443a3a09e Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Thu, 29 Feb 2024 12:41:21 -0800
Subject: [PATCH 2/2] format + addressed comments
---
llvm/include/llvm/Analysis/IVDescriptors.h | 4 ++--
.../Transforms/Vectorize/LoopVectorizationLegality.h | 10 +++++-----
llvm/lib/Analysis/IVDescriptors.cpp | 3 ++-
.../Transforms/Vectorize/LoopVectorizationLegality.cpp | 8 +++-----
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +++++---
llvm/lib/Transforms/Vectorize/VPlan.h | 3 +--
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 ++-
.../Transforms/LoopVectorize/RISCV/compress_expand.ll | 4 ++--
8 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 877204a8b2d864..dd60e84bf1f82e 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -13,11 +13,11 @@
#ifndef LLVM_ANALYSIS_IVDESCRIPTORS_H
#define LLVM_ANALYSIS_IVDESCRIPTORS_H
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/ADT/SetVector.h"
namespace llvm {
@@ -400,7 +400,7 @@ class MonotonicDescriptor {
public:
/// This enum represents the kinds of monotonic that we support.
enum MonotonicKind {
- MK_None, ///< Not a monotonic variable.
+ MK_None, ///< Not a monotonic variable.
MK_Integer, /// < Integer monotonic variable. Step = C
MK_Pointer, /// < Pointer monotonic variable. Step = C
};
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 9896211ca11d83..fa2208e5e460cb 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -258,8 +258,7 @@ class LoopVectorizationLegality {
using InductionList = MapVector<PHINode *, InductionDescriptor>;
/// MonotonicPhiList contains phi nodes that represent monotonic idiom
- using MonotonicPhiList =
- MapVector<const PHINode *, MonotonicDescriptor>;
+ using MonotonicPhiList = MapVector<const PHINode *, MonotonicDescriptor>;
/// RecurrenceSet contains the phi nodes that are recurrences other than
/// inductions and reductions.
@@ -315,9 +314,10 @@ class LoopVectorizationLegality {
/// Returns the MonotonicDescriptor associated with an \p I instruction
/// Returns emtpy descriptor if \p I instruction is non-monotonic.
- const MonotonicDescriptor *getMonotonicDescriptor(const Instruction *I) const {
+ const MonotonicDescriptor *
+ getMonotonicDescriptor(const Instruction *I) const {
for (const auto &PMD : getMonotonics()) {
- if (const auto *Phi = dyn_cast<const PHINode>(I))
+ if (const auto *Phi = dyn_cast<PHINode>(I))
if (PMD.second.getPhis().contains(const_cast<PHINode *>(Phi)))
return &PMD.second;
if (PMD.second.getUpdateOp() == I)
@@ -391,7 +391,7 @@ class LoopVectorizationLegality {
bool isConsecutiveMonotonicPtr(Value *Ptr) const;
/// Return true if \p Ptr computation depends on monotonic value.
- bool ptrHasMonotonicOperand(Value *Ptr) const;
+ bool hasMonotonicOperand(Value *Ptr) const;
/// Returns true if value V is uniform across \p VF lanes, when \p VF is
/// provided, and otherwise if \p V is invariant across all loop iterations.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 9194a5622b7dc2..dbe1890477879c 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -1575,7 +1575,8 @@ MonotonicDescriptor::isMonotonicPHI(PHINode *Phi, const Loop *L,
// It's important to check all uses of the Phi and make sure they are either
// outside of the loop.
- // TODO: Support uses under nested predicate, which can be supported by vectorizer
+ // TODO: Support uses under nested predicate, which can be supported by
+ // vectorizer
for (User *U : Phi->users()) {
auto *UI = cast<Instruction>(U);
if (!L->contains(UI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 77348826e067cf..c41af389ebb8ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -477,7 +477,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
}
bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
- assert(ptrHasMonotonicOperand(Ptr) &&
+ assert(hasMonotonicOperand(Ptr) &&
"Pointer's computation does not use monotonic values.");
auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
@@ -494,8 +494,7 @@ bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
return Step && Step->getAPInt().getZExtValue() == 1;
}
-bool LoopVectorizationLegality::ptrHasMonotonicOperand(
- Value *Ptr) const {
+bool LoopVectorizationLegality::hasMonotonicOperand(Value *Ptr) const {
auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return false;
@@ -962,8 +961,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
if (EnableMonotonics && TTI->enableMonotonicVectorization())
- if (auto MD =
- MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
+ if (auto MD = MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
if (canVectorizeMonotonic(MD)) {
addMonotonic(MD);
continue;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab90b5dc50ab69..42bb07f122f0a0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4115,7 +4115,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
bool LoopVectorizationCostModel::memoryInstructionUsesMonotonic(
Instruction *I, ElementCount VF) {
assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
- return Legal->ptrHasMonotonicOperand(getLoadStorePointerOperand(I));
+ return Legal->hasMonotonicOperand(getLoadStorePointerOperand(I));
}
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
@@ -6084,7 +6084,7 @@ LoopVectorizationCostModel::getMonotonicMemoryOpCost(Instruction *I,
return InstructionCost::getInvalid();
LLVMContext &Ctx = I->getContext();
- SmallVector<Type *> ParamTys;;
+ SmallVector<Type *> ParamTys;
ParamTys.push_back(VectorTy);
ParamTys.push_back(Ptr->getType());
ParamTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), VF));
@@ -6484,7 +6484,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// Load or store with monotonic index in pointer's computation
// requires special handling of a mask.
- if (Legal->ptrHasMonotonicOperand(Ptr))
+ if (Legal->hasMonotonicOperand(Ptr))
return false;
// For scalable vectors, a uniform memop load is always
@@ -9492,6 +9492,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewSI = Builder.CreateIntrinsic(
Builder.getVoidTy(), Intrinsic::masked_compressstore,
{StoredVal, VecPtr, BlockInMaskParts[Part]});
+ cast<IntrinsicInst>(NewSI)->addParamAttr(
+ 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
} else {
if (isReverse()) {
// If we store to reverse consecutive memory locations, then we need
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0297b5bae7f9ad..3a8b69f2b5c191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2254,8 +2254,7 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
class VPMonotonicHeaderPHIRecipe final : public VPHeaderPHIRecipe {
public:
VPMonotonicHeaderPHIRecipe(PHINode *Phi, VPValue *StartValue)
- : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi,
- StartValue) {}
+ : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi, StartValue) {}
~VPMonotonicHeaderPHIRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6b89a53600dd6c..c05f741eb10f94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1146,7 +1146,8 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
getStepValue()->printAsOperand(O, SlotTracker);
}
-void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
+void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << Indent << "monotonic-update ";
printAsOperand(O, SlotTracker);
O << " = ";
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
index 71de9d495508da..6e7097e0963478 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
@@ -96,7 +96,7 @@ define i32 @compress_store(i32 %n, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[MONOTONIC_PHI]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], <4 x i1> [[TMP4]])
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP7]], <4 x i1> [[TMP4]])
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
; CHECK-NEXT: [[TMP9:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = zext i4 [[TMP9]] to i32
@@ -196,7 +196,7 @@ define i64 @compress_store_i64(i32 %n, ptr noalias %a, ptr noalias %b) {
; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[MONOTONIC_PHI]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP6]], <4 x i1> [[TMP4]])
+; CHECK-NEXT: call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP6]], <4 x i1> [[TMP4]])
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
; CHECK-NEXT: [[TMP8:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = zext i4 [[TMP8]] to i64