[llvm] [LV] Vectorization of compress idiom (PR #83467)

Kolya Panchenko via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 29 12:42:32 PST 2024


https://github.com/nikolaypanchenko updated https://github.com/llvm/llvm-project/pull/83467

>From 20fe5ee426ffe05205f19f3641ad6f0e69cfba86 Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Tue, 6 Feb 2024 11:59:58 -0800
Subject: [PATCH 1/2] [LV] Support for monotonic idiom

The monotonic idiom is a special form of loop-carried dependency, which
can be described as
```
  m += step
  ... m ...
```
where
* `m` is a scalar variable,
* `step` is a loop-invariant value,
* the update is done under some non-uniform condition,
* the use(s) happen under the same or nested condition(s).

Whether `m` is used on the lhs or the rhs determines which special vector
code needs to be generated on the use side:
* if `m` is used on the lhs (as a store index), the pattern is known as
  compress, since the stored data needs to be compressed before the store;
* if `m` is used on the rhs (as a load index), the pattern is known as
  expand/decompress, since the loaded data needs to be expanded according
  to the mask.
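
For illustration, a minimal C++ sketch of the two patterns (the function and
variable names are illustrative; the arrays are assumed not to alias):
```
// Compress: the monotonic index `k` is used on the store (lhs) side.
// With a unit step this maps to llvm.masked.compressstore.
int compress(int n, int *a, const int *b) {
  int k = 0;
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[k++] = b[i];
  return k;
}

// Expand: the monotonic index `k` is used on the load (rhs) side. This would
// map to llvm.masked.expandload and is not handled by this changeset.
int expand(int n, const int *a, int *b) {
  int k = 0;
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      b[i] = a[k++];
  return k;
}
```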

The changeset adds a new descriptor for monotonic values as defined above
and adds initial support for unit-strided compress stores.
---
 llvm/include/llvm/Analysis/IVDescriptors.h    |  49 ++
 .../llvm/Analysis/TargetTransformInfo.h       |   8 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   2 +
 .../Vectorize/LoopVectorizationLegality.h     |  58 ++
 llvm/lib/Analysis/IVDescriptors.cpp           | 124 ++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   4 +
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   4 +
 .../Target/RISCV/RISCVTargetTransformInfo.h   |   4 +
 .../Vectorize/LoopVectorizationLegality.cpp   |  88 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  99 ++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.h         | 110 ++-
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  16 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  69 ++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  62 ++
 .../Transforms/Vectorize/VPlanTransforms.h    |   4 +
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   2 +
 .../LoopVectorize/RISCV/compress_expand.ll    | 702 ++++++++++++++++++
 18 files changed, 1390 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 5c7b613ac48c40..877204a8b2d864 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/ADT/SetVector.h"
 
 namespace llvm {
 
@@ -395,6 +396,54 @@ class InductionDescriptor {
   SmallVector<Instruction *, 2> RedundantCasts;
 };
 
+class MonotonicDescriptor {
+public:
+  /// This enum represents the kinds of monotonic values that we support.
+  enum MonotonicKind {
+    MK_None,    ///< Not a monotonic variable.
+    MK_Integer, ///< Integer monotonic variable. Step = C.
+    MK_Pointer, ///< Pointer monotonic variable. Step = C.
+  };
+
+public:
+  MonotonicDescriptor() = default;
+
+  Value *getStartValue() const { return StartValue; }
+  MonotonicKind getKind() const { return MK; }
+  const SCEV *getStep() const { return Step; }
+  const Instruction *getUpdateOp() const { return UpdateOp; }
+  const SetVector<PHINode *> &getPhis() const { return Phis; }
+  bool isHeaderPhi(const PHINode *Phi) const {
+    return !Phis.empty() && Phis[0] == Phi;
+  }
+
+  /// Returns a valid descriptor if \p Phi forms a monotonic pattern within
+  /// loop \p L, and an empty descriptor otherwise.
+  static MonotonicDescriptor isMonotonicPHI(PHINode *Phi, const Loop *L,
+                                            PredicatedScalarEvolution &PSE);
+
+  operator bool() const { return MK != MK_None; }
+
+private:
+  /// Private constructor - used by \c isMonotonicPHI
+  MonotonicDescriptor(Value *Start, MonotonicKind K, const SCEV *Step,
+                      const Instruction *UpdateOp, SetVector<PHINode *> &Phis)
+      : StartValue(Start), MK(K), Step(Step), UpdateOp(UpdateOp),
+        Phis(Phis.begin(), Phis.end()) {}
+
+  /// Start value.
+  TrackingVH<Value> StartValue = nullptr;
+  /// Monotonic kind.
+  MonotonicKind MK = MK_None;
+  /// Step value.
+  const SCEV *Step = nullptr;
+  /// Instruction that advances the monotonic variable.
+  const Instruction *UpdateOp = nullptr;
+
+  /// All phis that are used to update the monotonic variable. The first
+  /// PHINode is expected to be in the loop header.
+  SetVector<PHINode *> Phis;
+};
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_IVDESCRIPTORS_H
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c0..a7bdefe0d95708 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1701,6 +1701,9 @@ class TargetTransformInfo {
   bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                              Align Alignment) const;
 
+  /// \returns true if vectorization of monotonics is supported by the target.
+  bool enableMonotonicVectorization() const;
+
   struct VPLegalization {
     enum VPTransform {
       // keep the predicating parameter
@@ -2131,6 +2134,7 @@ class TargetTransformInfo::Concept {
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                      Align Alignment) const = 0;
+  virtual bool enableMonotonicVectorization() const = 0;
   virtual VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
@@ -2874,6 +2878,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.hasActiveVectorLength(Opcode, DataType, Alignment);
   }
 
+  bool enableMonotonicVectorization() const override {
+    return Impl.enableMonotonicVectorization();
+  }
+
   VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const override {
     return Impl.getVPLegalizationStrategy(PI);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 13379cc126a40c..e77838882ee725 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -923,6 +923,8 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool enableMonotonicVectorization() const { return false; }
+
   TargetTransformInfo::VPLegalization
   getVPLegalizationStrategy(const VPIntrinsic &PI) const {
     return TargetTransformInfo::VPLegalization(
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a509ebf6a7e1b3..9896211ca11d83 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -257,6 +257,10 @@ class LoopVectorizationLegality {
   /// induction descriptor.
   using InductionList = MapVector<PHINode *, InductionDescriptor>;
 
+  /// MonotonicPhiList contains phi nodes that represent the monotonic idiom.
+  using MonotonicPhiList = MapVector<const PHINode *, MonotonicDescriptor>;
+
   /// RecurrenceSet contains the phi nodes that are recurrences other than
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
@@ -306,6 +310,42 @@ class LoopVectorizationLegality {
   /// Returns True if V is a Phi node of an induction variable in this loop.
   bool isInductionPhi(const Value *V) const;
 
+  /// Returns the Monotonics found in the loop
+  const MonotonicPhiList &getMonotonics() const { return MonotonicPhis; }
+
+  /// Returns the MonotonicDescriptor associated with instruction \p I, or
+  /// nullptr if \p I is not part of a monotonic idiom.
+  const MonotonicDescriptor *
+  getMonotonicDescriptor(const Instruction *I) const {
+    for (const auto &PMD : getMonotonics()) {
+      if (const auto *Phi = dyn_cast<const PHINode>(I))
+        if (PMD.second.getPhis().contains(const_cast<PHINode *>(Phi)))
+          return &PMD.second;
+      if (PMD.second.getUpdateOp() == I)
+        return &PMD.second;
+    }
+    return nullptr;
+  }
+
+  /// Returns true if instruction \p I is a phi of a monotonic.
+  bool isMonotonicPhi(const Instruction *I) const {
+    const auto *Phi = dyn_cast<PHINode>(I);
+    return Phi && MonotonicPhis.contains(Phi);
+  }
+
+  /// Returns true if value \p V is a phi of a monotonic.
+  bool isMonotonicPhi(const Value *V) const {
+    const auto *I = dyn_cast<Instruction>(V);
+    return I && isMonotonicPhi(I);
+  }
+
+  /// Returns true if instruction \p I is the update instruction of a
+  /// monotonic.
+  bool isMonotonicUpdate(const Instruction *I) const {
+    return any_of(getMonotonics(), [I](const auto &PMD) {
+      return PMD.second.getUpdateOp() == I;
+    });
+  }
+
   /// Returns a pointer to the induction descriptor, if \p Phi is an integer or
   /// floating point induction.
   const InductionDescriptor *getIntOrFpInductionDescriptor(PHINode *Phi) const;
@@ -346,6 +386,13 @@ class LoopVectorizationLegality {
   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
 
+  /// Returns true if \p Ptr depends on a monotonic value and the pointer
+  /// difference between two iterations is one element when the monotonic
+  /// value is updated.
+  bool isConsecutiveMonotonicPtr(Value *Ptr) const;
+
+  /// Return true if the computation of \p Ptr depends on a monotonic value.
+  bool ptrHasMonotonicOperand(Value *Ptr) const;
+
   /// Returns true if value V is uniform across \p VF lanes, when \p VF is
   /// provided, and otherwise if \p V is invariant across all loop iterations.
   bool isInvariant(Value *V) const;
@@ -443,6 +490,11 @@ class LoopVectorizationLegality {
   /// specific checks for outer loop vectorization.
   bool canVectorizeOuterLoop();
 
+  /// Return true if the loop vectorizer can generate correct code for the
+  /// given monotonic. The method is needed to gradually enable vectorization
+  /// of monotonics.
+  bool canVectorizeMonotonic(const MonotonicDescriptor &MD);
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed, and record the loads/stores that require masking.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
@@ -460,6 +512,9 @@ class LoopVectorizationLegality {
   void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
                        SmallPtrSetImpl<Value *> &AllowedExit);
 
+  /// Add MonotonicDescriptor
+  void addMonotonic(const MonotonicDescriptor &MD);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -510,6 +565,9 @@ class LoopVectorizationLegality {
   /// loop body.
   SmallPtrSet<Instruction *, 4> InductionCastsToIgnore;
 
+  /// Holds the phis of the monotonics
+  MonotonicPhiList MonotonicPhis;
+
   /// Holds the phi nodes that are fixed-order recurrences.
   RecurrenceSet FixedOrderRecurrences;
 
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 055f121e743411..9194a5622b7dc2 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -1475,6 +1475,130 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
   return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR);
 }
 
+MonotonicDescriptor
+MonotonicDescriptor::isMonotonicPHI(PHINode *Phi, const Loop *L,
+                                    PredicatedScalarEvolution &PSE) {
+  // A monotonic is a special loop-carried dependency which is incremented by
+  // an invariant value under some condition and used under the same or a
+  // nested condition. That's different from a conditional reduction, which
+  // does not allow such uses at all.
+
+  // Don't allow multiple updates of the value
+  if (Phi->getNumIncomingValues() != 2)
+    return MonotonicDescriptor();
+
+  Type *Ty = Phi->getType();
+  if (!Ty->isIntegerTy() && !Ty->isPointerTy())
+    return MonotonicDescriptor();
+
+  SetVector<PHINode *> Visited;
+  Visited.insert(Phi);
+  SmallVector<PHINode *> Worklist;
+
+  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+    if (Phi->getIncomingBlock(I) == L->getLoopPreheader())
+      continue;
+    auto *P = dyn_cast<PHINode>(Phi->getIncomingValue(I));
+    if (!P)
+      return MonotonicDescriptor();
+    Worklist.push_back(P);
+  }
+
+  auto FindSelfUpdate = [&]() -> Instruction * {
+    Instruction *SelfUpdate = nullptr;
+    // Visit the use-def chain of the Phi, expecting all incoming values to be
+    // phis that are used just once, i.e. within that chain.
+    while (!Worklist.empty()) {
+      PHINode *P = Worklist.pop_back_val();
+      if (Visited.contains(P))
+        continue;
+
+      Visited.insert(P);
+      // Expect all phis to be part of the loop.
+      if (!L->contains(P))
+        return nullptr;
+
+      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; ++I) {
+        Value *V = P->getIncomingValue(I);
+        if (auto *PN = dyn_cast<PHINode>(V)) {
+          Worklist.push_back(PN);
+          continue;
+        }
+        if (SelfUpdate != nullptr)
+          return nullptr;
+
+        if ((Ty->isIntegerTy() && !isa<BinaryOperator>(V)) ||
+            (Ty->isPointerTy() && !isa<GetElementPtrInst>(V)))
+          return nullptr;
+
+        SelfUpdate = cast<Instruction>(V);
+      }
+    }
+    return SelfUpdate;
+  };
+  Instruction *SelfUpdate = FindSelfUpdate();
+
+  // Expect `SelfUpdate` to be used only once.
+  // TODO: Support monotonic with a pre-increment
+  if (!SelfUpdate || SelfUpdate->getNumUses() != 1)
+    return MonotonicDescriptor();
+
+  Value *Step = nullptr;
+  if (auto *GEPUpdate = dyn_cast<GetElementPtrInst>(SelfUpdate)) {
+    if (GEPUpdate->getNumOperands() != 2)
+      return MonotonicDescriptor();
+
+    Step = GEPUpdate->getOperand(1);
+    // TODO: Re-enable update via GEP. This will require changes in VPlan to
+    // correctly print and generate updates
+    return MonotonicDescriptor();
+  }
+  auto *BO = cast<BinaryOperator>(SelfUpdate);
+  // TODO: support update instructions other than Add.
+  if (BO->getOpcode() != Instruction::Add)
+    return MonotonicDescriptor();
+
+  // Either `nsw` or `nuw` should be set, otherwise it's not safe to assume
+  // monotonic won't wrap.
+  if (!BO->hasNoSignedWrap() && !BO->hasNoUnsignedWrap())
+    return MonotonicDescriptor();
+  Step = BO->getOperand(0) == Phi ? BO->getOperand(1) : BO->getOperand(0);
+
+  if (!L->isLoopInvariant(Step))
+    return MonotonicDescriptor();
+
+  auto *StepSCEV = PSE.getSCEV(Step);
+  if (auto *C = dyn_cast<SCEVConstant>(StepSCEV))
+    // TODO: handle step != 1
+    if (!C->isOne())
+      return MonotonicDescriptor();
+
+  // It's important to check all uses of the Phi and make sure they are either
+  // outside of the loop, part of the phi chain, or in the same block as the
+  // self-update.
+  // TODO: Support uses under a nested predicate.
+  for (User *U : Phi->users()) {
+    auto *UI = cast<Instruction>(U);
+    if (!L->contains(UI))
+      continue;
+
+    // Ignore phis that are necessary to represent self-update
+    if (auto *P = dyn_cast<PHINode>(UI))
+      if (Visited.contains(P))
+        continue;
+
+    BasicBlock *UIParent = UI->getParent();
+    if (UIParent != SelfUpdate->getParent())
+      return MonotonicDescriptor();
+  }
+
+  Value *StartValue = Phi->getIncomingValueForBlock(L->getLoopPreheader());
+  // Record all visited Phis in a vector and place Phi at the beginning to
+  // simplify future analysis.
+  return MonotonicDescriptor(StartValue,
+                             Ty->isPointerTy() ? MK_Pointer : MK_Integer,
+                             StepSCEV, SelfUpdate, Visited);
+}
+
 bool InductionDescriptor::isInductionPHI(
     PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE,
     InductionDescriptor &D, const SCEV *Expr,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620e..becc8e821fd346 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1302,6 +1302,10 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+bool TargetTransformInfo::enableMonotonicVectorization() const {
+  return TTIImpl->enableMonotonicVectorization();
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2e4e69fb4f920f..da01d2f986b4b4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1609,3 +1609,7 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                   C2.NumIVMuls, C2.NumBaseAdds,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
+
+bool RISCVTTIImpl::enableMonotonicVectorization() const {
+  return ST->hasVInstructions();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index af36e9d5d5e886..c5e6fc26605b28 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -372,6 +372,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                      const TargetTransformInfo::LSRCost &C2);
 
+  /// \returns true if the ISA supports all instructions needed to vectorize
+  /// monotonics.
+  bool enableMonotonicVectorization() const;
+
   bool shouldFoldTerminatingConditionAfterLSR() const {
     return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 37a356c43e29a4..77348826e067cf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
+static cl::opt<bool>
+    EnableMonotonics("enable-monotonics", cl::init(true), cl::Hidden,
+                     cl::desc("Control whether vectorization of loops with "
+                              "monotonic variables is enabled"));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -471,6 +476,36 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
   return 0;
 }
 
+bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
+  assert(ptrHasMonotonicOperand(Ptr) &&
+         "Pointer's computation does not use monotonic values.");
+
+  auto *GEP = cast<GetElementPtrInst>(Ptr);
+  assert(GEP->getNumOperands() == 2 &&
+         "GetElementPtr with more than one index is not currently supported "
+         "and should have been filtered out earlier.");
+  Value *Monotonic = GEP->getOperand(1);
+  if (auto *Cast = dyn_cast<CastInst>(Monotonic))
+    Monotonic = Cast->getOperand(0);
+  const MonotonicDescriptor *MD =
+      getMonotonicDescriptor(cast<Instruction>(Monotonic));
+  assert(MD && "The index has no MonotonicDescriptor associated with it.");
+  const SCEVConstant *Step = dyn_cast<SCEVConstant>(MD->getStep());
+  return Step && Step->getAPInt().getZExtValue() == 1;
+}
+
+bool LoopVectorizationLegality::ptrHasMonotonicOperand(Value *Ptr) const {
+  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP)
+    return false;
+  return any_of(GEP->operands(), [&](Value *V) {
+    if (auto *Cast = dyn_cast<CastInst>(V))
+      return isMonotonicPhi(Cast->getOperand(0));
+    return isMonotonicPhi(V);
+  });
+}
+
 bool LoopVectorizationLegality::isInvariant(Value *V) const {
   return LAI->isInvariant(V);
 }
@@ -678,6 +713,47 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
   return Result;
 }
 
+bool LoopVectorizationLegality::canVectorizeMonotonic(
+    const MonotonicDescriptor &MD) {
+  Value *Monotonic = MD.getPhis().front();
+  auto IsUserInLoop = [&](User *U) -> bool {
+    auto *I = dyn_cast<Instruction>(U);
+    return I && TheLoop->contains(I);
+  };
+  auto CanIgnoreUser = [&](User *U) -> bool {
+    if (auto *PN = dyn_cast<PHINode>(U))
+      if (MD.getPhis().contains(PN))
+        return true;
+    return U == MD.getUpdateOp();
+  };
+
+  for (User *U : Monotonic->users()) {
+    if (!IsUserInLoop(U) || CanIgnoreUser(U))
+      continue;
+
+    // For now, expect the monotonic value to be used by a zext/sext with a
+    // single user, or by a GEP.
+    if (U->hasOneUser() && isa<ZExtInst, SExtInst>(U))
+      U = *cast<Instruction>(U)->users().begin();
+
+    if (!isa<GetElementPtrInst>(U))
+      return false;
+
+    // All GEPs should be used as the pointer operand of a store, which then
+    // becomes a compress store.
+    if (any_of(U->users(), [&](User *UI) {
+          if (!IsUserInLoop(UI) || CanIgnoreUser(UI))
+            return false;
+          return UI != MD.getUpdateOp() &&
+                 (!isa<StoreInst>(UI) || getLoadStorePointerOperand(UI) != U);
+        })) {
+      LLVM_DEBUG(
+          dbgs() << "LV: Expand of a monotonic value is not yet supported.\n");
+      return false;
+    }
+  }
+  return true;
+}
+
 void LoopVectorizationLegality::addInductionPhi(
     PHINode *Phi, const InductionDescriptor &ID,
     SmallPtrSetImpl<Value *> &AllowedExit) {
@@ -730,6 +806,11 @@ void LoopVectorizationLegality::addInductionPhi(
   LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
 }
 
+void LoopVectorizationLegality::addMonotonic(const MonotonicDescriptor &MD) {
+  for (PHINode *P : MD.getPhis())
+    MonotonicPhis[P] = MD;
+}
+
 bool LoopVectorizationLegality::setupOuterLoopInductions() {
   BasicBlock *Header = TheLoop->getHeader();
 
@@ -880,6 +961,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           addInductionPhi(Phi, ID, AllowedExit);
           continue;
         }
+        if (EnableMonotonics && TTI->enableMonotonicVectorization())
+          if (auto MD =
+                  MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
+            if (canVectorizeMonotonic(MD)) {
+              addMonotonic(MD);
+              continue;
+            }
 
         reportVectorizationFailure("Found an unidentified PHI",
             "value that could not be identified as "
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 50a073e890626e..ab90b5dc50ab69 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1253,7 +1253,8 @@ class LoopVectorizationCostModel {
     CM_GatherScatter,
     CM_Scalarize,
     CM_VectorCall,
-    CM_IntrinsicCall
+    CM_IntrinsicCall,
+    CM_MonotonicUnit, // For consecutive accesses with monotonic +1
   };
 
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1454,6 +1455,10 @@ class LoopVectorizationCostModel {
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
 
+  /// Returns true if \p I is a memory instruction with monotonic index(es)
+  /// that can be widened.
+  bool memoryInstructionUsesMonotonic(Instruction *I, ElementCount VF);
+
   /// Returns true if \p I is a memory instruction in an interleaved-group
   /// of memory accesses that can be vectorized with wide vector loads/stores
   /// and shuffles.
@@ -1647,6 +1652,10 @@ class LoopVectorizationCostModel {
   /// memory access.
   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
 
+  /// The cost computation for widening memory access \p I which has a
+  /// monotonic index.
+  InstructionCost getMonotonicMemoryOpCost(Instruction *I, ElementCount VF);
+
   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
   /// Load: scalar load + broadcast.
   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
@@ -3707,7 +3716,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
         return WideningDecision == CM_Scalarize;
     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
            "Ptr is neither a value or pointer operand");
-    return WideningDecision != CM_GatherScatter;
+    return WideningDecision != CM_GatherScatter &&
+           WideningDecision != CM_MonotonicUnit;
   };
 
   // A helper that returns true if the given value is a bitcast or
@@ -4102,6 +4112,12 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
   return true;
 }
 
+bool LoopVectorizationCostModel::memoryInstructionUsesMonotonic(
+    Instruction *I, ElementCount VF) {
+  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+  return Legal->ptrHasMonotonicOperand(getLoadStorePointerOperand(I));
+}
+
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // We should not collect Uniforms more than once per VF. Right now,
   // this function is called from collectUniformsAndScalars(), which
@@ -4188,6 +4204,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
 
     return (WideningDecision == CM_Widen ||
             WideningDecision == CM_Widen_Reverse ||
+            WideningDecision == CM_MonotonicUnit ||
             WideningDecision == CM_Interleave);
   };
 
@@ -5257,7 +5274,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     return 1;
 
   // We used the distance for the interleave count.
-  if (!Legal->isSafeForAnyVectorWidth())
+  if (!Legal->isSafeForAnyVectorWidth() || !Legal->getMonotonics().empty())
     return 1;
 
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
@@ -6054,6 +6071,29 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
   return Cost;
 }
 
+InstructionCost
+LoopVectorizationCostModel::getMonotonicMemoryOpCost(Instruction *I,
+                                                     ElementCount VF) {
+  Type *ValTy = getLoadStoreType(I);
+  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+  Value *Ptr = getLoadStorePointerOperand(I);
+
+  if (!Legal->isConsecutiveMonotonicPtr(Ptr))
+    return InstructionCost::getInvalid();
+  if (isa<LoadInst>(I))
+    return InstructionCost::getInvalid();
+
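+  // Only unit-stride compress stores are supported for now; model the cost
+  // as a call to the llvm.masked.compressstore intrinsic.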
+  LLVMContext &Ctx = I->getContext();
+  SmallVector<Type *> ParamTys;
+  ParamTys.push_back(VectorTy);
+  ParamTys.push_back(Ptr->getType());
+  ParamTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), VF));
+  IntrinsicCostAttributes CostAttrs(Intrinsic::masked_compressstore,
+                                    Type::getVoidTy(Ctx), ParamTys);
+  return TTI.getIntrinsicInstrCost(CostAttrs,
+                                   TargetTransformInfo::TCK_RecipThroughput);
+}
+
 InstructionCost
 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                 ElementCount VF) {
@@ -6442,6 +6482,11 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
           if (!foldTailByMasking())
             return true;
 
+          // A load or store with a monotonic index in its pointer computation
+          // requires special handling of the mask.
+          if (Legal->ptrHasMonotonicOperand(Ptr))
+            return false;
+
           // For scalable vectors, a uniform memop load is always
           // uniform-by-parts  and we know how to scalarize that.
           if (isa<LoadInst>(I))
@@ -6487,6 +6532,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         continue;
       }
 
+      if (memoryInstructionUsesMonotonic(&I, VF)) {
+        assert(
+            Legal->isConsecutiveMonotonicPtr(getLoadStorePointerOperand(&I)) &&
+            "Expected consecutive monotonic pointer");
+        setWideningDecision(&I, VF, CM_MonotonicUnit,
+                            getMonotonicMemoryOpCost(&I, VF));
+        continue;
+      }
+
       // Choose between Interleaving, Gather/Scatter or Scalarization.
       InstructionCost InterleaveCost = InstructionCost::getInvalid();
       unsigned NumAccesses = 1;
@@ -6901,6 +6955,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
       return *RedCost;
 
+    if (auto *MD = Legal->getMonotonicDescriptor(I)) {
+      if (VF.isScalable())
+        return InstructionCost::getInvalid();
+      InstructionCost Cost = 0;
+
+      Type *ScalarTy = Type::getIntNTy(I->getContext(), VF.getKnownMinValue());
+      IntrinsicCostAttributes CostAttrs(Intrinsic::ctpop, ScalarTy, {ScalarTy});
+
+      Cost += TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+      return Cost;
+    }
+
     // Certain instructions can be cheaper to vectorize if they have a constant
     // second vector operand. One example of this are shifts on x86.
     Value *Op2 = I->getOperand(1);
@@ -7034,6 +7100,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       case LoopVectorizationCostModel::CM_VectorCall:
       case LoopVectorizationCostModel::CM_IntrinsicCall:
         llvm_unreachable_internal("Instr has invalid widening decision");
+      case LoopVectorizationCostModel::CM_MonotonicUnit:
+        return TTI::CastContextHint::Masked;
       }
 
       llvm_unreachable("Unhandled case!");
@@ -8060,9 +8128,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
   // reverse consecutive.
   LoopVectorizationCostModel::InstWidening Decision =
       CM.getWideningDecision(I, Range.Start);
+  bool Monotonic = Decision == LoopVectorizationCostModel::CM_MonotonicUnit;
   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
   bool Consecutive =
-      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
+      Reverse || Monotonic || Decision == LoopVectorizationCostModel::CM_Widen;
 
   VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
   if (Consecutive) {
@@ -8080,7 +8149,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
 
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
-                                            Consecutive, Reverse);
+                                            Consecutive, Reverse, Monotonic);
 }
 
 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -8423,7 +8492,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
       return Recipe;
 
     VPHeaderPHIRecipe *PhiRecipe = nullptr;
-    assert((Legal->isReductionVariable(Phi) ||
+    assert((Legal->isReductionVariable(Phi) || Legal->isMonotonicPhi(Phi) ||
             Legal->isFixedOrderRecurrence(Phi)) &&
            "can only widen reductions and fixed-order recurrences here");
     VPValue *StartV = Operands[0];
@@ -8435,6 +8504,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                            CM.isInLoopReduction(Phi),
                                            CM.useOrderedReductions(RdxDesc));
+    } else if (Legal->isMonotonicPhi(Phi)) {
+      PhiRecipe = new VPMonotonicHeaderPHIRecipe(Phi, StartV);
     } else {
       // TODO: Currently fixed-order recurrences are modeled as chains of
       // first-order recurrences. If there are no users of the intermediate
@@ -8468,6 +8539,16 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
   if (auto *CI = dyn_cast<CallInst>(Instr))
     return tryToWidenCall(CI, Operands, Range, Plan);
 
+  if (Legal->isMonotonicUpdate(Instr)) {
+    const MonotonicDescriptor *MD = Legal->getMonotonicDescriptor(Instr);
+    assert(MD && "Monotonic descriptor was not found");
+    assert(Operands.size() == 2 &&
+           "Only binary monotonic updates are supported");
+    VPValue *Mask = getBlockInMask(Instr->getParent());
+    return new VPMonotonicUpdateInstruction(Mask, Operands[0], Operands[1],
+                                            Instr->getDebugLoc(), *MD);
+  }
+
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range, Plan);
 
@@ -9372,6 +9453,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   assert((LI || SI) && "Invalid Load/Store instruction");
   assert((!SI || StoredValue) && "No stored value provided for widened store");
   assert((!LI || !StoredValue) && "Stored value provided for widened load");
+  assert((!Monotonic || !LI) && "Expand load is not yet supported");
 
   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
 
@@ -9405,6 +9487,11 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         Value *VectorGep = State.get(getAddr(), Part);
         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                             MaskPart);
+      } else if (isMonotonic()) {
+        auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true);
+        NewSI = Builder.CreateIntrinsic(
+            Builder.getVoidTy(), Intrinsic::masked_compressstore,
+            {StoredVal, VecPtr, BlockInMaskParts[Part]});
       } else {
         if (isReverse()) {
           // If we store to reverse consecutive memory locations, then we need
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4aeab6fc619988..37bae74ed9acb7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -888,6 +888,7 @@ void VPlan::execute(VPTransformState *State) {
                             (isa<VPReductionPHIRecipe>(PhiR) &&
                              cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
     bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+                       isa<VPMonotonicHeaderPHIRecipe>(PhiR) ||
                        (isa<VPReductionPHIRecipe>(PhiR) &&
                         cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 16c09a83e777dd..0297b5bae7f9ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -866,7 +866,9 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenIntOrFpInductionSC:
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
+    case VPRecipeBase::VPMonotonicHeaderPHISC:
     case VPRecipeBase::VPScalarCastSC:
+    case VPRecipeBase::VPMonotonicUpdateSC:
       return true;
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPBranchOnMaskSC:
@@ -1155,6 +1157,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
     BranchOnCount,
     BranchOnCond,
     ComputeReductionResult,
+    MonotonicUpdate,
   };
 
 private:
@@ -1790,6 +1793,68 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
   VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
 };
 
+class VPMonotonicUpdateInstruction : public VPInstruction {
+private:
+  MonotonicDescriptor MD;
+
+public:
+  explicit VPMonotonicUpdateInstruction(VPValue *Mask, VPValue *Op1,
+                                        VPValue *Op2, DebugLoc DL,
+                                        MonotonicDescriptor MD,
+                                        const Twine &Name = "")
+      : VPInstruction(VPInstruction::MonotonicUpdate, {Op1, Op2}, DL, Name),
+        MD(MD) {
+    addOperand(Mask);
+    setUnderlyingValue(
+        cast<Value>(const_cast<Instruction *>(MD.getUpdateOp())));
+  }
+
+  explicit VPMonotonicUpdateInstruction() = delete;
+  ~VPMonotonicUpdateInstruction() override = default;
+
+  const MonotonicDescriptor &getMonotonicDescriptor() const { return MD; }
+
+  /// Returns the incoming monotonic value to be updated.
+  VPValue *getIncomingValue() const { return getOperand(0); }
+
+  /// Returns the step value of the monotonic.
+  VPValue *getStepValue() const { return getOperand(1); }
+
+  /// Returns the mask of the instruction.
+  VPValue *getMask() const { return getOperand(2); }
+
+  VPRecipeBase *clone() override {
+    return new VPMonotonicUpdateInstruction(getMask(), getIncomingValue(),
+                                            getStepValue(), getDebugLoc(),
+                                            getMonotonicDescriptor());
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    auto *R = cast<VPRecipeBase>(D);
+    auto *I = dyn_cast<VPInstruction>(R);
+    return I && I->getOpcode() == VPInstruction::MonotonicUpdate;
+  }
+
+  static inline bool classof(const VPRecipeBase *R) {
+    auto *VPInst = dyn_cast<VPInstruction>(R);
+    return VPInst && VPInst->getOpcode() == VPInstruction::MonotonicUpdate;
+  }
+
+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && VPMonotonicUpdateInstruction::classof(R);
+  }
+
+  void execute(VPTransformState &State) override final;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for handling first-order recurrence phis. The start value is the
 /// first operand of the recipe and the incoming value from the backedge is the
 /// second operand.
@@ -2185,6 +2250,35 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
   }
 };
 
+/// VPMonotonicHeaderPHIRecipe represents a phi of the monotonic value
+class VPMonotonicHeaderPHIRecipe final : public VPHeaderPHIRecipe {
+public:
+  VPMonotonicHeaderPHIRecipe(PHINode *Phi, VPValue *StartValue)
+      : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi,
+                          StartValue) {}
+
+  ~VPMonotonicHeaderPHIRecipe() override = default;
+
+  void execute(VPTransformState &State) override;
+
+  VPRecipeBase *clone() override {
+    return new VPMonotonicHeaderPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
+                                          getOperand(0));
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  VP_CLASSOF_IMPL(VPDef::VPMonotonicHeaderPHISC);
+
+  static inline bool classof(const VPHeaderPHIRecipe *R) {
+    return R->getVPDefID() == VPDef::VPMonotonicHeaderPHISC;
+  }
+};
+
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
 /// order to merge values that are set under such a branch and feed their uses.
@@ -2236,6 +2330,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
   // Whether the consecutive loaded/stored addresses are in reverse order.
   bool Reverse;
 
+  // Whether a monotonic value is used as the store index (compress store).
+  bool Monotonic = false;
+
   void setMask(VPValue *Mask) {
     if (!Mask)
       return;
@@ -2258,10 +2355,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
 
   VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *Mask,
-                                 bool Consecutive, bool Reverse)
+                                 bool Consecutive, bool Reverse,
+                                 bool Monotonic = false)
       : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}),
-        Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
+        Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse),
+        Monotonic(Monotonic) {
     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    assert((!Monotonic || Consecutive) &&
+           "Non-consecutive compress store is not supported");
     setMask(Mask);
   }
 
@@ -2301,6 +2402,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
   // Return whether the loaded-from / stored-to addresses are consecutive.
   bool isConsecutive() const { return Consecutive; }
 
+  // Return whether the store uses a monotonic value in its address
+  // computation.
+  bool isMonotonic() const { return Monotonic; }
+
   // Return whether the consecutive loaded/stored addresses are in reverse
   // order.
   bool isReverse() const { return Reverse; }
@@ -3412,6 +3516,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
     return Rep->isUniform();
   if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
     return all_of(GEP->operands(), isUniformAfterVectorization);
+  if (isa<VPMonotonicUpdateInstruction, VPMonotonicHeaderPHIRecipe>(Def))
+    return true;
   if (auto *VPI = dyn_cast<VPInstruction>(Def))
     return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
   return false;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index f55beac2047c94..4cba282b7b5d8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -213,14 +213,14 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
   Type *ResultTy =
       TypeSwitch<const VPRecipeBase *, Type *>(V->getDefiningRecipe())
           .Case<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe,
-                VPReductionPHIRecipe, VPWidenPointerInductionRecipe>(
-              [this](const auto *R) {
-                // Handle header phi recipes, except VPWienIntOrFpInduction
-                // which needs special handling due it being possibly truncated.
-                // TODO: consider inferring/caching type of siblings, e.g.,
-                // backedge value, here and in cases below.
-                return inferScalarType(R->getStartValue());
-              })
+                VPReductionPHIRecipe, VPWidenPointerInductionRecipe,
+                VPMonotonicHeaderPHIRecipe>([this](const auto *R) {
+            // Handle header phi recipes, except VPWidenIntOrFpInduction
+            // which needs special handling due to it being possibly truncated.
+            // TODO: consider inferring/caching type of siblings, e.g.,
+            // backedge value, here and in cases below.
+            return inferScalarType(R->getStartValue());
+          })
           .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
               [](const auto *R) { return R->getScalarType(); })
           .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 27b72575ddd51a..6b89a53600dd6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1087,6 +1087,49 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   VecInd->addIncoming(LastInduction, VectorPH);
 }
 
+/// Generate the scalar phi for the monotonic value:
+///   %monotonic.phi = phi [%start, %vector.ph],
+///                        [%monotonic.update, %vector.latch]
+void VPMonotonicHeaderPHIRecipe::execute(VPTransformState &State) {
+  IRBuilder<>::InsertPointGuard Guard(State.Builder);
+  State.Builder.SetInsertPoint(State.CFG.PrevBB->getFirstNonPHI());
+
+  Value *StartV = State.get(getStartValue(), VPIteration(0, 0));
+  auto *Phi = State.Builder.CreatePHI(StartV->getType(), 2, "monotonic.phi");
+  BasicBlock *PreheaderBB = State.CFG.getPreheaderBBFor(this);
+  Phi->addIncoming(StartV, PreheaderBB);
+
+  // Use the same Phi for all Parts
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    State.set(this, Phi, Part, /*IsScalar=*/true);
+}
+
+/// Generate the following sequence to update the scalar monotonic variable:
+///   %bcast = bitcast %mask to iVF
+///   %0 = llvm.ctpop(%bcast)
+///   %1 = mul %step, %0          // where %step is the step of the monotonic
+///   %monotonic.update = add %monotonic, %1
+void VPMonotonicUpdateInstruction::execute(VPTransformState &State) {
+  assert(State.UF == 1 && "Unrolling is not supported.");
+  assert(!State.VF.isScalable() &&
+         "Scalable vectorization of monotonics is not yet supported.");
+  auto &Builder = State.Builder;
+  Value *V = State.get(getIncomingValue(), 0, /*NeedsScalar=*/true);
+  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
+  Value *Mask = State.get(getMask(), 0);
+
+  Value *ScalarMask = Builder.CreateBitCast(
+      Mask, Builder.getIntNTy(State.VF.getKnownMinValue()));
+  Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop,
+                                        {ScalarMask->getType()}, {ScalarMask});
+  Popc = Builder.CreateZExtOrTrunc(Popc, Step->getType());
+  Step = Builder.CreateMul(Step, Popc, "monotonic.vf.step");
+  const auto *OrigUpdateOp = cast<BinaryOperator>(MD.getUpdateOp());
+  Value *NewV = Builder.CreateBinOp(OrigUpdateOp->getOpcode(), V, Step,
+                                    "monotonic.update");
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    State.set(this, NewV, Part, /*IsScalar=*/true);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -1102,6 +1145,32 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
   O << ", ";
   getStepValue()->printAsOperand(O, SlotTracker);
 }
+
+void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent,
+                                         VPSlotTracker &SlotTracker) const {
+  O << Indent << "monotonic-update ";
+  printAsOperand(O, SlotTracker);
+  O << " = ";
+  O << MD.getUpdateOp()->getOpcodeName();
+  O << ' ';
+  getOperand(0)->printAsOperand(O, SlotTracker);
+  O << ", ";
+  getOperand(1)->printAsOperand(O, SlotTracker);
+  O << " @";
+  getMask()->printAsOperand(O, SlotTracker);
+
+  if (auto DL = getDebugLoc()) {
+    O << ", !dbg ";
+    DL.print(O);
+  }
+}
+
+void VPMonotonicHeaderPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                                       VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = monotonic-phi ";
+  printOperands(O, SlotTracker);
+}
 #endif
 
 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9c3f35112b592f..844ac0cdfe1576 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -24,6 +24,8 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
 
+#define DEBUG_TYPE "loop-vectorize"
+
 using namespace llvm;
 
 using namespace llvm::PatternMatch;
@@ -500,6 +502,65 @@ static void removeDeadRecipes(VPlan &Plan) {
   }
 }
 
+void VPlanTransforms::simplifyMonotonics(VPlan &Plan) {
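+  // Monotonic updates are emitted under a predicate, so after the HCFG has
+  // been flattened the blends that forward the updated value back to the
+  // header phi are redundant: rewire the header phi (and any live-outs) to
+  // the update directly and erase those blends.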
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT))
+    for (VPRecipeBase &R : *VPBB) {
+      auto *MonotonicPhi = dyn_cast<VPMonotonicHeaderPHIRecipe>(&R);
+      if (!MonotonicPhi)
+        continue;
+
+      DenseSet<VPRecipeBase *> ToRemove;
+      SmallVector<VPRecipeBase *> Worklist = {
+          &MonotonicPhi->getBackedgeRecipe()};
+      VPMonotonicUpdateInstruction *VPMUI = nullptr;
+
+      while (!Worklist.empty()) {
+        VPRecipeBase *RR = Worklist.pop_back_val();
+        if (RR->getParent() != MonotonicPhi->getParent()) {
+          LLVM_DEBUG(dbgs() << "LV: Cannot simplify non-flattened HCFG\n");
+          return;
+        }
+        if (ToRemove.contains(RR) || RR == MonotonicPhi)
+          continue;
+
+        if (auto *MU = dyn_cast<VPMonotonicUpdateInstruction>(RR)) {
+          VPMUI = MU;
+          continue;
+        }
+        auto *VPB = dyn_cast<VPBlendRecipe>(RR);
+        if (!VPB) {
+          LLVM_DEBUG(dbgs()
+                     << "LV: Blend recipes are expected to propagate new "
+                        "value of monotonic to a header phi\n");
+          return;
+        }
+        ToRemove.insert(RR);
+        for (unsigned I = 0, E = VPB->getNumIncomingValues(); I != E; ++I) {
+          VPValue *V = VPB->getIncomingValue(I);
+          VPRecipeBase *Def = V->getDefiningRecipe();
+          if (!Def) {
+            LLVM_DEBUG(dbgs()
+                       << "LV: Unsupported VPValue in simplifyMonotonics\n");
+            return;
+          }
+          Worklist.push_back(Def);
+        }
+      }
+      assert(VPMUI && "Monotonic update must exist in a VPlan");
+      // Use the VPValue of the monotonic update instruction in the header phi
+      // instead.
+      MonotonicPhi->setOperand(1, VPMUI);
+      for (auto &PV : Plan.getLiveOuts()) {
+        VPLiveOut *LO = PV.second;
+        if (ToRemove.contains(LO->getOperand(0)->getDefiningRecipe()))
+          LO->setOperand(0, VPMUI);
+      }
+      for (VPRecipeBase *RR : ToRemove)
+        RR->eraseFromParent();
+    }
+}
+
 static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
                                     ScalarEvolution &SE, Instruction *TruncI,
                                     VPValue *StartV, VPValue *Step,
@@ -1071,6 +1132,7 @@ void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
   removeDeadRecipes(Plan);
 
   createAndOptimizeReplicateRegions(Plan);
+  simplifyMonotonics(Plan);
 
   removeRedundantExpandSCEVRecipes(Plan);
   mergeBlocksIntoPredecessors(Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index ff83c3f083b093..b6aa553d808f94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -98,6 +98,10 @@ struct VPlanTransforms {
   ///       VPlan directly.
   static void dropPoisonGeneratingRecipes(
       VPlan &Plan, function_ref<bool(BasicBlock *)> BlockNeedsPredication);
+
+  /// Simplify usage of monotonics within a VPlan by removing unnecessary
+  /// blends if the HCFG has been flattened.
+  static void simplifyMonotonics(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 1d2c17e91b7ab9..8bc626419e4d3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -360,6 +360,7 @@ class VPDef {
     VPWidenMemoryInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
+    VPMonotonicUpdateSC,
     // START: Phi-like recipes. Need to be kept together.
     VPBlendSC,
     VPWidenPHISC,
@@ -371,6 +372,7 @@ class VPDef {
     VPFirstOrderRecurrencePHISC,
     VPWidenIntOrFpInductionSC,
     VPWidenPointerInductionSC,
+    VPMonotonicHeaderPHISC,
     VPReductionPHISC,
     // END: SubclassID for recipes that inherit VPHeaderPHIRecipe
     // END: Phi-like recipes
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
new file mode 100644
index 00000000000000..71de9d495508da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
@@ -0,0 +1,702 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mattr=+v,+d -S -mtriple riscv64 -force-vector-width=4 %s -o -  | FileCheck %s
+
+define ptr @compress_on_pointers(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define ptr @compress_on_pointers(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[A_ADDR_1_LCSSA:%.*]] = phi ptr [ [[A_ADDR_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[A_ADDR_0_LCSSA:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A_ADDR_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret ptr [[A_ADDR_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[A_ADDR_09:%.*]] = phi ptr [ [[A]], [[FOR_BODY_PREHEADER]] ], [ [[A_ADDR_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[A_ADDR_09]], i64 1
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[A_ADDR_09]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[A_ADDR_1]] = phi ptr [ [[INCDEC_PTR]], [[IF_THEN]] ], [ [[A_ADDR_09]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp8 = icmp sgt i32 %n, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %a.addr.1.lcssa = phi ptr [ %a.addr.1, %for.inc ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %a.addr.0.lcssa = phi ptr [ %a, %entry ], [ %a.addr.1.lcssa, %for.cond.cleanup.loopexit ]
+  ret ptr %a.addr.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %a.addr.09 = phi ptr [ %a, %for.body.preheader ], [ %a.addr.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %incdec.ptr = getelementptr inbounds i32, ptr %a.addr.09, i64 1
+  store i32 %0, ptr %a.addr.09, align 4
+  br label %for.inc
+
+for.inc:
+  %a.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %a.addr.09, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+define i32 @compress_store(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_store(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[MONOTONIC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[MONOTONIC_UPDATE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[MONOTONIC_PHI]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], <4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
+; CHECK-NEXT:    [[TMP9:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i4 [[TMP9]] to i32
+; CHECK-NEXT:    [[MONOTONIC_VF_STEP:%.*]] = mul i32 1, [[TMP10]]
+; CHECK-NEXT:    [[MONOTONIC_UPDATE]] = add i32 [[MONOTONIC_PHI]], [[MONOTONIC_VF_STEP]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ], [ [[MONOTONIC_UPDATE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_011:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[RET_011]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[RET_011]], 1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[RET_011]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.011 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idx.ext = sext i32 %ret.011 to i64
+  %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+  store i32 %0, ptr %add.ptr, align 4
+  %inc = add nsw i32 %ret.011, 1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %inc, %if.then ], [ %ret.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
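
The vector.body checked above turns the conditional `a[ret] = b[i]; ret++` into an
`llvm.masked.compressstore` followed by advancing the counter by `step * ctpop(mask)`
(the bitcast/ctpop/zext/mul/add sequence). A rough scalar model of one VF=4 iteration,
with illustrative names that are not taken from the patch:

```
#include <cstdint>

// Sketch of one VF=4 iteration of the vector.body above: compress the active
// lanes of the loaded chunk of b into a at the current counter value, then
// bump the counter by the number of active lanes (the ctpop of the mask).
static void compress_store_vf4_iteration(const int32_t b_chunk[4], int32_t *a,
                                         int32_t &monotonic_phi) {
  int32_t active = 0;
  for (int lane = 0; lane < 4; ++lane) {
    bool m = b_chunk[lane] != 0;                    // icmp eq ...; xor ..., true
    if (m)                                          // masked.compressstore packs
      a[monotonic_phi + active++] = b_chunk[lane];  // active lanes contiguously
  }
  monotonic_phi += 1 * active;  // monotonic-update: phi += step * ctpop(mask)
}
```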
+
+
+define i64 @compress_store_i64(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i64 @compress_store_i64(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[MONOTONIC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[MONOTONIC_UPDATE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[MONOTONIC_PHI]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP6]], <4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
+; CHECK-NEXT:    [[TMP8:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i4 [[TMP8]] to i64
+; CHECK-NEXT:    [[MONOTONIC_VF_STEP:%.*]] = mul i64 1, [[TMP9]]
+; CHECK-NEXT:    [[MONOTONIC_UPDATE]] = add i64 [[MONOTONIC_PHI]], [[MONOTONIC_VF_STEP]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i64 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ], [ [[MONOTONIC_UPDATE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i64 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_011:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[RET_011]]
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i64 [[RET_011]], 1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i64 [ [[INC]], [[IF_THEN]] ], [ [[RET_011]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i64 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i64 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.011 = phi i64 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %add.ptr = getelementptr inbounds i32, ptr %a, i64 %ret.011
+  store i32 %0, ptr %add.ptr, align 4
+  %inc = add nsw i64 %ret.011, 1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i64 [ %inc, %if.then ], [ %ret.011, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @compress_store_strided(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_store_strided(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[RET_010]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[RET_010]], 2
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_010]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.010 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idx.ext = sext i32 %ret.010 to i64
+  %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+  store i32 %0, ptr %add.ptr, align 4
+  %add = add nsw i32 %ret.010, 2
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add, %if.then ], [ %ret.010, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @compress_reverse(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @compress_reverse(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_010:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[RET_010]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[ADD_PTR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[RET_010]], -1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_010]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.010 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idx.ext = sext i32 %ret.010 to i64
+  %add.ptr = getelementptr inbounds i32, ptr %a, i64 %idx.ext
+  store i32 %0, ptr %add.ptr, align 4
+  %add = add nsw i32 %ret.010, -1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add, %if.then ], [ %ret.010, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @expand_store(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[RET_012]], 1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idxprom1 = sext i32 %ret.012 to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %1, ptr %arrayidx4, align 4
+  %add = add nsw i32 %ret.012, 1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @expand_store_strided(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store_strided(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[RET_012]], 2
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idxprom1 = sext i32 %ret.012 to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %1, ptr %arrayidx4, align 4
+  %add = add nsw i32 %ret.012, 2
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @expand_store_reverse(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand_store_reverse(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_012:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[RET_012]] to i64
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[RET_012]], -1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RET_012]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.012 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %idxprom1 = sext i32 %ret.012 to i64
+  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %idxprom1
+  %1 = load i32, ptr %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %1, ptr %arrayidx4, align 4
+  %add = add nsw i32 %ret.012, -1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add, %if.then ], [ %ret.012, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @expand(i32 %n, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: define i32 @expand(
+; CHECK-SAME: i32 [[N:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[RET_1_LCSSA:%.*]] = phi i32 [ [[RET_1:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[RET_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RET_1_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RET_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[RET_014:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[RET_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[RET_014]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ADD5:%.*]] = add nsw i32 [[RET_014]], 1
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[RET_1]] = phi i32 [ [[ADD5]], [[IF_THEN]] ], [ [[RET_014]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %ret.1, %for.inc ]
+  ret i32 %ret.0.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %ret.014 = phi i32 [ 0, %for.body.preheader ], [ %ret.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %add = add nsw i32 %0, %ret.014
+  %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %add, ptr %arrayidx4, align 4
+  %add5 = add nsw i32 %ret.014, 1
+  br label %for.inc
+
+for.inc:
+  %ret.1 = phi i32 [ %add5, %if.then ], [ %ret.014, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
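
For reference, the two idioms these tests exercise correspond to scalar loops of
roughly the following shape (a C-style reconstruction of the IR above; per the
CHECK lines, only the unit-strided compress store gets a vector body in this patch):

```
// compress_store: the monotonic counter indexes the array being stored to.
int compress_store(int n, int *a, const int *b) {
  int ret = 0;
  for (int i = 0; i < n; ++i)
    if (b[i]) {
      a[ret] = b[i];
      ++ret;
    }
  return ret;
}

// expand_store: the monotonic counter indexes the array being loaded from.
int expand_store(int n, int *a, const int *b) {
  int ret = 0;
  for (int i = 0; i < n; ++i)
    if (b[i]) {
      a[i] = b[ret];
      ++ret;
    }
  return ret;
}
```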

>From d3256232d31f361099ff917eef19c26443a3a09e Mon Sep 17 00:00:00 2001
From: Kolya Panchenko <kolya.panchenko at sifive.com>
Date: Thu, 29 Feb 2024 12:41:21 -0800
Subject: [PATCH 2/2] format + addressed comments

---
 llvm/include/llvm/Analysis/IVDescriptors.h             |  4 ++--
 .../Transforms/Vectorize/LoopVectorizationLegality.h   | 10 +++++-----
 llvm/lib/Analysis/IVDescriptors.cpp                    |  3 ++-
 .../Transforms/Vectorize/LoopVectorizationLegality.cpp |  8 +++-----
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        |  8 +++++---
 llvm/lib/Transforms/Vectorize/VPlan.h                  |  3 +--
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp         |  3 ++-
 .../Transforms/LoopVectorize/RISCV/compress_expand.ll  |  4 ++--
 8 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 877204a8b2d864..dd60e84bf1f82e 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -13,11 +13,11 @@
 #ifndef LLVM_ANALYSIS_IVDESCRIPTORS_H
 #define LLVM_ANALYSIS_IVDESCRIPTORS_H
 
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"
-#include "llvm/ADT/SetVector.h"
 
 namespace llvm {
 
@@ -400,7 +400,7 @@ class MonotonicDescriptor {
 public:
   /// This enum represents the kinds of monotonic that we support.
   enum MonotonicKind {
-    MK_None,  ///< Not a monotonic variable.
+    MK_None,    ///< Not a monotonic variable.
     MK_Integer, /// < Integer monotonic variable. Step = C
     MK_Pointer, /// < Pointer monotonic variable. Step = C
   };
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 9896211ca11d83..fa2208e5e460cb 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -258,8 +258,7 @@ class LoopVectorizationLegality {
   using InductionList = MapVector<PHINode *, InductionDescriptor>;
 
   /// MonotonicPhiList contains phi nodes that represent monotonic idiom
-  using MonotonicPhiList =
-      MapVector<const PHINode *, MonotonicDescriptor>;
+  using MonotonicPhiList = MapVector<const PHINode *, MonotonicDescriptor>;
 
   /// RecurrenceSet contains the phi nodes that are recurrences other than
   /// inductions and reductions.
@@ -315,9 +314,10 @@ class LoopVectorizationLegality {
 
   /// Returns the MonotonicDescriptor associated with an \p I instruction
   /// Returns empty descriptor if \p I instruction is non-monotonic.
-  const MonotonicDescriptor *getMonotonicDescriptor(const Instruction *I) const {
+  const MonotonicDescriptor *
+  getMonotonicDescriptor(const Instruction *I) const {
     for (const auto &PMD : getMonotonics()) {
-      if (const auto *Phi = dyn_cast<const PHINode>(I))
+      if (const auto *Phi = dyn_cast<PHINode>(I))
         if (PMD.second.getPhis().contains(const_cast<PHINode *>(Phi)))
           return &PMD.second;
       if (PMD.second.getUpdateOp() == I)
@@ -391,7 +391,7 @@ class LoopVectorizationLegality {
   bool isConsecutiveMonotonicPtr(Value *Ptr) const;
 
   /// Return true if \p Ptr computation depends on monotonic value.
-  bool ptrHasMonotonicOperand(Value *Ptr) const;
+  bool hasMonotonicOperand(Value *Ptr) const;
 
   /// Returns true if value V is uniform across \p VF lanes, when \p VF is
   /// provided, and otherwise if \p V is invariant across all loop iterations.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 9194a5622b7dc2..dbe1890477879c 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -1575,7 +1575,8 @@ MonotonicDescriptor::isMonotonicPHI(PHINode *Phi, const Loop *L,
 
   // It's important to check all uses of the Phi and make sure they are either
   // outside of the loop.
-  // TODO: Support uses under nested predicate, which can be supported by vectorizer
+  // TODO: Support uses under nested predicate, which can be supported by
+  // vectorizer
   for (User *U : Phi->users()) {
     auto *UI = cast<Instruction>(U);
     if (!L->contains(UI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 77348826e067cf..c41af389ebb8ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -477,7 +477,7 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
 }
 
 bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
-  assert(ptrHasMonotonicOperand(Ptr) &&
+  assert(hasMonotonicOperand(Ptr) &&
          "Pointer's computation does not use monotonic values.");
 
   auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
@@ -494,8 +494,7 @@ bool LoopVectorizationLegality::isConsecutiveMonotonicPtr(Value *Ptr) const {
   return Step && Step->getAPInt().getZExtValue() == 1;
 }
 
-bool LoopVectorizationLegality::ptrHasMonotonicOperand(
-    Value *Ptr) const {
+bool LoopVectorizationLegality::hasMonotonicOperand(Value *Ptr) const {
   auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP)
     return false;
@@ -962,8 +961,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
         if (EnableMonotonics && TTI->enableMonotonicVectorization())
-          if (auto MD =
-                  MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
+          if (auto MD = MonotonicDescriptor::isMonotonicPHI(Phi, TheLoop, PSE))
             if (canVectorizeMonotonic(MD)) {
               addMonotonic(MD);
               continue;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ab90b5dc50ab69..42bb07f122f0a0 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4115,7 +4115,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
 bool LoopVectorizationCostModel::memoryInstructionUsesMonotonic(
     Instruction *I, ElementCount VF) {
   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
-  return Legal->ptrHasMonotonicOperand(getLoadStorePointerOperand(I));
+  return Legal->hasMonotonicOperand(getLoadStorePointerOperand(I));
 }
 
 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
@@ -6084,7 +6084,7 @@ LoopVectorizationCostModel::getMonotonicMemoryOpCost(Instruction *I,
     return InstructionCost::getInvalid();
 
   LLVMContext &Ctx = I->getContext();
-  SmallVector<Type *> ParamTys;;
+  SmallVector<Type *> ParamTys;
   ParamTys.push_back(VectorTy);
   ParamTys.push_back(Ptr->getType());
   ParamTys.push_back(VectorType::get(Type::getInt1Ty(Ctx), VF));
@@ -6484,7 +6484,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
 
           // Load or store with monotonic index in pointer's computation
           // requires special handling of a mask.
-          if (Legal->ptrHasMonotonicOperand(Ptr))
+          if (Legal->hasMonotonicOperand(Ptr))
             return false;
 
           // For scalable vectors, a uniform memop load is always
@@ -9492,6 +9492,8 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         NewSI = Builder.CreateIntrinsic(
             Builder.getVoidTy(), Intrinsic::masked_compressstore,
             {StoredVal, VecPtr, BlockInMaskParts[Part]});
+        cast<IntrinsicInst>(NewSI)->addParamAttr(
+            1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
       } else {
         if (isReverse()) {
           // If we store to reverse consecutive memory locations, then we need
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0297b5bae7f9ad..3a8b69f2b5c191 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2254,8 +2254,7 @@ class VPBranchOnMaskRecipe : public VPRecipeBase {
 class VPMonotonicHeaderPHIRecipe final : public VPHeaderPHIRecipe {
 public:
   VPMonotonicHeaderPHIRecipe(PHINode *Phi, VPValue *StartValue)
-      : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi,
-                          StartValue) {}
+      : VPHeaderPHIRecipe(VPDef::VPMonotonicHeaderPHISC, Phi, StartValue) {}
 
   ~VPMonotonicHeaderPHIRecipe() override = default;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6b89a53600dd6c..c05f741eb10f94 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1146,7 +1146,8 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
   getStepValue()->printAsOperand(O, SlotTracker);
 }
 
-void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
+void VPMonotonicUpdateInstruction::print(raw_ostream &O, const Twine &Indent,
+                                         VPSlotTracker &SlotTracker) const {
   O << Indent << "monotonic-update ";
   printAsOperand(O, SlotTracker);
   O << " = ";
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
index 71de9d495508da..6e7097e0963478 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/compress_expand.ll
@@ -96,7 +96,7 @@ define i32 @compress_store(i32 %n, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[MONOTONIC_PHI]] to i64
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP7]], <4 x i1> [[TMP4]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP7]], <4 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP8]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = zext i4 [[TMP9]] to i32
@@ -196,7 +196,7 @@ define i64 @compress_store_i64(i32 %n, ptr noalias %a, ptr noalias %b) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[A]], i64 [[MONOTONIC_PHI]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr [[TMP6]], <4 x i1> [[TMP4]])
+; CHECK-NEXT:    call void @llvm.masked.compressstore.v4i32(<4 x i32> [[WIDE_LOAD]], ptr align 4 [[TMP6]], <4 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i1> [[TMP4]] to i4
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP7]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext i4 [[TMP8]] to i64
