[llvm] [LoopVectorize] Vectorize the compact pattern (PR #68980)

via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 13 04:48:34 PDT 2023


https://github.com/huhu233 created https://github.com/llvm/llvm-project/pull/68980

This patch tries to vectorize the compact pattern shown below:

  for (i = 0; i < N; i++) {
    x = comp[i];
    if (x < a) Out_ref[n++] = B[i];
  }
    
It introduces the following changes:
1. Add pattern matching in LoopVectorizationLegality to detect and cache compact chains.
2. Introduce two new recipes to handle the compact chain:
    VPCompactPHIRecipe: handles the entry PHI of the compact chain.
    VPWidenCompactInstructionRecipe: handles the other instructions in the compact chain.
3. Slightly adapt the cost model for the compact pattern.
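
On an SVE target, the idea is that each vector iteration packs the selected
elements to the front of a vector register (COMPACT) and advances the output
index by the number of active lanes (CNTP). The hand-written ACLE sketch below
only illustrates the shape of code the vectorizer is aiming for; the names are
illustrative and not part of the patch, which emits the equivalent LLVM IR
through the new recipes:

  #include <arm_sve.h>
  #include <stdint.h>

  // Illustrative only: a hand-vectorized form of the compact pattern using
  // SVE ACLE intrinsics (not code from the patch).
  int64_t compact_ref(int64_t N, int32_t a, const int32_t *comp,
                      int32_t *Out_ref, const int32_t *B) {
    int64_t n = 0;
    for (int64_t i = 0; i < N; i += svcntw()) {
      svbool_t pg = svwhilelt_b32(i, N);         // loop predicate
      svint32_t x = svld1(pg, comp + i);         // x = comp[i..]
      svbool_t cmp = svcmplt(pg, x, a);          // lanes where x < a
      svint32_t b = svld1(cmp, B + i);           // masked load of B[i..]
      svint32_t packed = svcompact(cmp, b);      // pack selected lanes to the front
      uint64_t cnt = svcntp_b32(pg, cmp);        // number of selected lanes
      svst1(svwhilelt_b32((uint64_t)0, cnt),     // store only the packed lanes
            Out_ref + n, packed);
      n += cnt;                                  // advance the compacted index
    }
    return n;
  }

The patch produces roughly this structure in IR: the masked loads stay as they
are today, while the guarded store and the counter update are handled by the
new compact recipes.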

>From 73a1356bf5a05f738675345be497e8ea291cedb7 Mon Sep 17 00:00:00 2001
From: huhu233 <1293577861 at qq.com>
Date: Fri, 13 Oct 2023 19:36:39 +0800
Subject: [PATCH 1/2] [LoopVectorize] Precommit a test for the compact pattern

---
 .../LoopVectorize/AArch64/compact.ll          | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/compact.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
new file mode 100644
index 000000000000000..0bafd325bb8e98e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -S < %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
+define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
+; CHECK-LABEL: @kernel_reference(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[N_013:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[N_013]], 1
+; CHECK-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[N_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %0, %a
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %inc = add nsw i32 %n.013, 1
+  %idxprom4 = sext i32 %n.013 to i64
+  %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4
+  store i32 %1, ptr %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ]
+  ret i32 %n.0.lcssa
+}
+
+attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"}

>From 74b4fe41e5428071c22690585a190e2aeacbea92 Mon Sep 17 00:00:00 2001
From: huhu233 <1293577861 at qq.com>
Date: Fri, 13 Oct 2023 19:38:06 +0800
Subject: [PATCH 2/2] [LoopVectorize] Vectorize the compact pattern

This patch tries to vectorize the compact pattern shown below:

  for (i = 0; i < N; i++) {
    x = comp[i];
    if (x < a) Out_ref[n++] = B[i];
  }

It introduces the following changes:
1. Add pattern matching in LoopVectorizationLegality to detect and cache
   compact chains.
2. Introduce two new recipes to handle the compact chain:
   VPCompactPHIRecipe: handles the entry PHI of the compact chain.
   VPWidenCompactInstructionRecipe: handles the other instructions in the
   compact chain.
3. Slightly adapt the cost model for the compact pattern.
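
As a rough illustration of the matching in change 1, the legality analysis has
to recognize the counter increment that feeds the latch phi, i.e.
%inc = add %n, 1. A minimal sketch with llvm::PatternMatch (the helper name is
illustrative; the actual check in the patch lives in isMatchCompact):

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Illustrative helper: does IncValue advance the compact counter phi by
  // exactly one? m_c_Add accepts the phi on either side of the add.
  static bool isCompactIncrement(Value *IncValue, PHINode *Phi) {
    return match(IncValue, m_c_Add(m_Specific(Phi), m_One()));
  }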
---
 .../llvm/Analysis/TargetTransformInfo.h       |  25 +++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   7 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   4 +
 .../include/llvm/Transforms/Utils/LoopUtils.h |   8 +
 .../Vectorize/LoopVectorizationLegality.h     |  35 ++++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  16 ++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  12 ++
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |   2 +
 .../AArch64/AArch64TargetTransformInfo.h      |  10 ++
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |  29 ++++
 .../Vectorize/LoopVectorizationLegality.cpp   | 157 ++++++++++++++++++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 116 ++++++++++++-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   5 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  17 ++
 llvm/lib/Transforms/Vectorize/VPlan.h         |  62 +++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 135 +++++++++++++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   2 +
 .../LoopVectorize/AArch64/compact-vplan.ll    |  78 +++++++++
 .../LoopVectorize/AArch64/compact.ll          |  96 +++++++++--
 20 files changed, 801 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..c2851c10e6ff3ef 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1672,6 +1672,11 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
+  InstructionCost getCompactCost() const;
+  bool isTargetSupportedCompactStore() const;
+  unsigned getTargetSupportedCompact() const;
+  unsigned getTargetSupportedCNTP() const;
+
   /// @}
 
 private:
@@ -2041,6 +2046,10 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual bool isTargetSupportedCompactStore() const = 0;
+  virtual unsigned getTargetSupportedCompact() const = 0;
+  virtual unsigned getTargetSupportedCNTP() const = 0;
+  virtual InstructionCost getCompactCost() const = 0;
 };
 
 template <typename T>
@@ -2757,6 +2766,22 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  bool isTargetSupportedCompactStore() const override {
+    return Impl.isTargetSupportedCompactStore();
+  }
+
+  unsigned getTargetSupportedCompact() const override {
+    return Impl.getTargetSupportedCompact();
+  }
+
+  unsigned getTargetSupportedCNTP() const override {
+    return Impl.getTargetSupportedCNTP();
+  }
+
+  InstructionCost getCompactCost() const override {
+    return Impl.getCompactCost();
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c1ff314ae51c98b..e063f383980a724 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -895,6 +895,13 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool isTargetSupportedCompactStore() const { return false; }
+  unsigned getTargetSupportedCompact() const { return 0; }
+  unsigned getTargetSupportedCNTP() const { return 0; }
+  InstructionCost getCompactCost() const {
+    return InstructionCost::getInvalid();
+  }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 3dd16dafe3c42a7..737757761eca4ab 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -700,6 +700,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getST()->getMaxPrefetchIterationsAhead();
   }
 
+  virtual InstructionCost getCompactCost() const {
+    return InstructionCost::getInvalid();
+  }
+
   virtual bool enableWritePrefetching() const {
     return getST()->enableWritePrefetching();
   }
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 0d99249be413762..348b8ad03de4179 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -409,6 +409,14 @@ Value *createAnyOfTargetReduction(IRBuilderBase &B, Value *Src,
 Value *createTargetReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc,
                              Value *Src, PHINode *OrigPhi = nullptr);
 
+Value *createTargetCompact(IRBuilderBase &B, Module *M,
+                           const TargetTransformInfo *TTI, Value *Mask,
+                           Value *Val);
+
+Value *createTargetCNTP(IRBuilderBase &B, Module *M,
+                        const TargetTransformInfo *TTI, Value *Mask,
+                        Value *Val);
+
 /// Create an ordered reduction intrinsic using the given recurrence
 /// descriptor \p Desc.
 Value *createOrderedReduction(IRBuilderBase &B,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..7f82154699e5174 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -224,6 +224,26 @@ class LoopVectorizationRequirements {
   Instruction *ExactFPMathInst = nullptr;
 };
 
+class CompactDescriptor {
+  PHINode *LiveOutPhi;
+  bool IsCompactSign;
+  SmallPtrSet<Value *, 8> Chain;
+
+public:
+  CompactDescriptor() = default;
+  CompactDescriptor(SmallPtrSetImpl<Value *> &CompactChain, PHINode *LiveOut,
+                    bool IsSign)
+      : LiveOutPhi(LiveOut), IsCompactSign(IsSign) {
+    Chain.insert(CompactChain.begin(), CompactChain.end());
+  }
+
+  bool isInCompactChain(Value *V) const { return Chain.find(V) != Chain.end(); }
+
+  PHINode *getLiveOutPhi() const { return LiveOutPhi; }
+
+  bool isSign() const { return IsCompactSign; }
+};
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -261,6 +281,8 @@ class LoopVectorizationLegality {
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
 
+  using CompactList = MapVector<PHINode *, CompactDescriptor>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -397,6 +419,14 @@ class LoopVectorizationLegality {
 
   DominatorTree *getDominatorTree() const { return DT; }
 
+  const CompactList &getCompactList() const { return CpList; }
+
+  bool hasCompactChain() const { return CpList.size() > 0; }
+
+  PHINode *getCompactChainStart(Instruction *I) const;
+
+  bool isSign(PHINode *Phi) { return CpList[Phi].isSign(); }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -425,6 +455,8 @@ class LoopVectorizationLegality {
   /// and we only need to check individual instructions.
   bool canVectorizeInstrs();
 
+  bool isMatchCompact(PHINode *Phi, Loop *TheLoop, CompactDescriptor &CpDesc);
+
   /// When we vectorize loops we may change the order in which
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
@@ -538,6 +570,9 @@ class LoopVectorizationLegality {
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  // Record compact chain in the loop.
+  CompactList CpList;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index aad14f21d114619..b7596bb2e0dfc92 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1248,6 +1248,22 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+bool TargetTransformInfo::isTargetSupportedCompactStore() const {
+  return TTIImpl->isTargetSupportedCompactStore();
+}
+
+unsigned TargetTransformInfo::getTargetSupportedCompact() const {
+  return TTIImpl->getTargetSupportedCompact();
+}
+
+unsigned TargetTransformInfo::getTargetSupportedCNTP() const {
+  return TTIImpl->getTargetSupportedCNTP();
+}
+
+InstructionCost TargetTransformInfo::getCompactCost() const {
+  return TTIImpl->getCompactCost();
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fc9e3ff3734989d..d30d0b57b5d47b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/raw_ostream.h"
@@ -301,6 +302,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::FFREXP:
     Res = PromoteIntRes_FFREXP(N);
     break;
+  case ISD::INTRINSIC_WO_CHAIN:
+    if (N->getConstantOperandVal(0) == Intrinsic::aarch64_sve_compact) {
+      Res = PromoteIntRes_COMPACT(N);
+      break;
+    }
   }
 
   // If the result is null then the sub-method took care of registering it.
@@ -5942,6 +5948,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
   return DAG.getBuildVector(N->getValueType(0), dl, NewOps);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_COMPACT(SDNode *N) {
+  SDValue OpExt = SExtOrZExtPromotedInteger(N->getOperand(2));
+  return DAG.getNode(N->getOpcode(), SDLoc(N), OpExt.getValueType(),
+                     N->getOperand(0), N->getOperand(1), OpExt);
+}
+
 SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
   assert(OpNo > 1);
   SDValue Op = N->getOperand(OpNo);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index c802604a3470e13..d204169ed2327f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -364,6 +364,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntRes_FunnelShift(SDNode *N);
   SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
   SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
+  SDValue PromoteIntRes_COMPACT(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d8a0e68d7123759..5ca5f22525d3dd3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3889,3 +3889,5 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
     return AM.Scale != 0 && AM.Scale != 1;
   return -1;
 }
+
+InstructionCost AArch64TTIImpl::getCompactCost() const { return 6; }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..28bd48e8ed76c4a 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include <cstdint>
 #include <optional>
 
@@ -412,6 +413,15 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
     return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
   }
+
+  bool isTargetSupportedCompactStore() const { return ST->hasSVE(); }
+  unsigned getTargetSupportedCompact() const {
+    return Intrinsic::aarch64_sve_compact;
+  }
+  unsigned getTargetSupportedCNTP() const {
+    return Intrinsic::aarch64_sve_cntp;
+  }
+  InstructionCost getCompactCost() const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 21affe7bdce406e..1373fb7931f0a7e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1119,6 +1120,34 @@ Value *llvm::createTargetReduction(IRBuilderBase &B,
   return createSimpleTargetReduction(B, Src, RK);
 }
 
+Value *llvm::createTargetCompact(IRBuilderBase &B, Module *M,
+                                 const TargetTransformInfo *TTI, Value *Mask,
+                                 Value *Val) {
+  Intrinsic::ID IID = TTI->getTargetSupportedCompact();
+  switch (IID) {
+  default:
+    return nullptr;
+  case Intrinsic::aarch64_sve_compact:
+    Function *CompactMaskDecl = Intrinsic::getDeclaration(
+        M, Intrinsic::aarch64_sve_compact, Val->getType());
+    return B.CreateCall(CompactMaskDecl, {Mask, Val});
+  }
+}
+
+Value *llvm::createTargetCNTP(IRBuilderBase &B, Module *M,
+                              const TargetTransformInfo *TTI, Value *Mask,
+                              Value *Val) {
+  Intrinsic::ID IID = TTI->getTargetSupportedCNTP();
+  switch (IID) {
+  default:
+    return nullptr;
+  case Intrinsic::aarch64_sve_cntp:
+    Function *CNTPDecl = Intrinsic::getDeclaration(
+        M, Intrinsic::aarch64_sve_cntp, Val->getType());
+    return B.CreateCall(CNTPDecl, {Mask, Val});
+  }
+}
+
 Value *llvm::createOrderedReduction(IRBuilderBase &B,
                                     const RecurrenceDescriptor &Desc,
                                     Value *Src, Value *Start) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..dbab8af159a1621 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 
@@ -78,6 +79,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
+static cl::opt<bool>
+    EnableCompactVectorization("enable-compact-vectorization", cl::init(true),
+                               cl::Hidden,
+                               cl::desc("Enable vectorizing compact pattern."));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -785,6 +791,143 @@ static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
   return Scalarize;
 }
 
+static bool isUserOfCompactPHI(BasicBlock *BB, PHINode *Phi, Instruction *I) {
+  if (I->getParent() != BB)
+    return false;
+
+  // Operations on PHI should be affine.
+  if (I->getOpcode() != Instruction::Add &&
+      I->getOpcode() != Instruction::Sub &&
+      I->getOpcode() != Instruction::SExt &&
+      I->getOpcode() != Instruction::ZExt)
+    return false;
+
+  if (I == Phi)
+    return true;
+
+  for (unsigned i = 0; i < I->getNumOperands(); i++) {
+    if (auto *Instr = dyn_cast<Instruction>(I->getOperand(i)))
+      if (isUserOfCompactPHI(BB, Phi, Instr))
+        return true;
+  }
+  return false;
+}
+
+// Match the basic compact pattern:
+// for.body:
+//    %src.phi = phi i64 [ 0, %preheader ], [ %target.phi, %for.inc ]
+//    ...
+// if.then:
+//    ...
+//    %data = load i32, ptr %In
+//    (there may be an additional sext/zext if %src.phi has type i32)
+//    %addr = getelementptr i32, ptr %Out, i64 %src.phi
+//    store i32 %data, ptr %addr
+//    %inc = add i64 %src.phi, 1
+// for.inc
+//    %target.phi = phi i64 [ %inc, if.then ], [ %src.phi, %for.body ]
+bool LoopVectorizationLegality::isMatchCompact(PHINode *Phi, Loop *TheLoop,
+                                               CompactDescriptor &CpDesc) {
+  if (Phi->getNumIncomingValues() > 2)
+    return false;
+
+  // Don't support phis that are used as masks.
+  for (User *U : Phi->users()) {
+    if (isa<CmpInst>(U))
+      return false;
+  }
+
+  SmallPtrSet<Value *, 8> CompactChain;
+  CompactChain.insert(Phi);
+
+  BasicBlock *LoopPreHeader = TheLoop->getLoopPreheader();
+  int ExitIndex = Phi->getIncomingBlock(0) == LoopPreHeader ? 1 : 0;
+  BasicBlock *ExitBlock = Phi->getIncomingBlock(ExitIndex);
+  PHINode *CompactLiveOut = nullptr;
+  Value *IncValue = nullptr;
+  BasicBlock *IncBlock = nullptr;
+  bool IsCycle = false;
+  for (auto &CandPhi : ExitBlock->phis()) {
+    if (llvm::is_contained(CandPhi.incoming_values(), Phi) &&
+        CandPhi.getNumIncomingValues() == 2) {
+      IsCycle = true;
+      CompactLiveOut = &CandPhi;
+      int IncIndex = CandPhi.getIncomingBlock(0) == Phi->getParent() ? 1 : 0;
+      IncBlock = CandPhi.getIncomingBlock(IncIndex);
+      IncValue = CandPhi.getIncomingValueForBlock(IncBlock);
+      break;
+    }
+  }
+  // Similar to a reduction PHI.
+  if (!IsCycle)
+    return false;
+  CompactChain.insert(CompactLiveOut);
+
+  // Match the pattern %inc = add i32 %src.phi, 1.
+  Value *Index = nullptr, *Step = nullptr;
+  if (!match(IncValue, m_Add(m_Value(Index), m_Value(Step))))
+    return false;
+  if (Index != Phi) {
+    std::swap(Index, Step);
+  }
+  if (Step != ConstantInt::get(Step->getType(), 1))
+    return false;
+  CompactChain.insert(IncValue);
+
+  const DataLayout &DL = Phi->getModule()->getDataLayout();
+  int CntCandStores = 0;
+  GetElementPtrInst *GEP = nullptr;
+  for (auto &Inst : *IncBlock) {
+    if (auto *SI = dyn_cast<StoreInst>(&Inst)) {
+      // TODO: Support llvm.aarch64.sve.compact.nxv8i16 and
+      // llvm.aarch64.sve.compact.nxv16i8 in the future.
+      unsigned TySize = DL.getTypeSizeInBits(SI->getValueOperand()->getType());
+      if (TySize < 32)
+        return false;
+
+      GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand());
+      if (GEP == nullptr)
+        continue;
+
+      // Only handle single pointer.
+      if (GEP->getNumOperands() != 2)
+        continue;
+
+      // Get the index of the GEP; the index may be the phi itself or a
+      // sext/zext of it (if the phi has type i32).
+      Value *Op1 = GEP->getOperand(1);
+      Value *X = nullptr;
+      SmallSet<Value *, 16> CandiInstrs;
+      if (match(Op1, m_SExt(m_Value(X))) || match(Op1, m_ZExt(m_Value(X)))) {
+        Op1 = X;
+      }
+      Instruction *Op1Instr = dyn_cast<Instruction>(Op1);
+      if (!Op1Instr || isUserOfCompactPHI(IncBlock, Phi, Op1Instr))
+        continue;
+      CompactChain.insert(GEP);
+      CompactChain.insert(SI);
+      CntCandStores++;
+    }
+  }
+  if (!CntCandStores)
+    return false;
+
+  KnownBits Bits = computeKnownBits(Phi, DL);
+  bool IsSign = !Bits.isNonNegative();
+  CompactDescriptor CompactDesc(CompactChain, CompactLiveOut, IsSign);
+  CpDesc = CompactDesc;
+  LLVM_DEBUG(dbgs() << "LV: Found a compact chain.\n");
+  return true;
+}
+
+PHINode *LoopVectorizationLegality::getCompactChainStart(Instruction *I) const {
+  for (auto &CpDesc : CpList) {
+    if (CpDesc.second.isInCompactChain(I))
+      return CpDesc.first;
+  }
+  return nullptr;
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *Header = TheLoop->getHeader();
 
@@ -881,6 +1024,14 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
 
+        CompactDescriptor CpDesc;
+        if (EnableCompactVectorization &&
+            TTI->isTargetSupportedCompactStore() &&
+            isMatchCompact(Phi, TheLoop, CpDesc)) {
+          CpList[Phi] = CpDesc;
+          continue;
+        }
+
         reportVectorizationFailure("Found an unidentified PHI",
             "value that could not be identified as "
             "reduction is used outside the loop",
@@ -1525,16 +1676,22 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
   SmallPtrSet<const Value *, 8> ReductionLiveOuts;
+  SmallPtrSet<const Value *, 8> CompactLiveOuts;
 
   for (const auto &Reduction : getReductionVars())
     ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
 
+  for (const auto &Compact : getCompactList())
+    CompactLiveOuts.insert(Compact.second.getLiveOutPhi());
+
   // TODO: handle non-reduction outside users when tail is folded by masking.
   for (auto *AE : AllowedExit) {
     // Check that all users of allowed exit values are inside the loop or
     // are the live-out of a reduction.
     if (ReductionLiveOuts.count(AE))
       continue;
+    if (CompactLiveOuts.count(AE))
+      continue;
     for (User *U : AE->users()) {
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 88f064b6d57cebc..28d3cadc0cf392d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -619,6 +619,8 @@ class InnerLoopVectorizer {
   /// Create code for the loop exit value of the reduction.
   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
 
+  void fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, VPTransformState &State);
+
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
@@ -1968,7 +1970,8 @@ class GeneratedRTChecks {
   /// there is no vector code generation, the check blocks are removed
   /// completely.
   void Create(Loop *L, const LoopAccessInfo &LAI,
-              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
+              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
+              LoopVectorizationLegality *LVL = nullptr) {
 
     // Hard cutoff to limit compile-time increase in case a very large number of
     // runtime checks needs to be generated.
@@ -2001,7 +2004,7 @@ class GeneratedRTChecks {
                                  "vector.memcheck");
 
       auto DiffChecks = RtPtrChecking.getDiffChecks();
-      if (DiffChecks) {
+      if (DiffChecks && !(LVL && LVL->hasCompactChain())) {
         Value *RuntimeVF = nullptr;
         MemRuntimeCheckCond = addDiffRuntimeChecks(
             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
@@ -3654,6 +3657,47 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
   for (VPRecipeBase &R : Header->phis()) {
     if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
       fixFixedOrderRecurrence(FOR, State);
+    else if (auto *CompactR = dyn_cast<VPCompactPHIRecipe>(&R))
+      fixCompactPHI(CompactR, State);
+  }
+}
+
+void InnerLoopVectorizer::fixCompactPHI(VPCompactPHIRecipe *CompactPHIR,
+                                        VPTransformState &State) {
+  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstNonPHI());
+  VPValue *VPBackEdgeValue = CompactPHIR->getBackedgeValue();
+  Value *BackEdgeValue = State.get(VPBackEdgeValue, State.UF - 1);
+  Value *StartValue = CompactPHIR->getStartValue()->getUnderlyingValue();
+  Value *TruncBackEdgeValue = BackEdgeValue;
+  if (StartValue->getType() != BackEdgeValue->getType())
+    TruncBackEdgeValue =
+        Builder.CreateTruncOrBitCast(BackEdgeValue, StartValue->getType());
+
+  // Generate phi in scalar preheader to pass LiveIns outside the loop.
+  PHINode *ScalarPreheaderPN =
+      PHINode::Create(StartValue->getType(), 2, "compact.rdx",
+                      LoopScalarPreHeader->getFirstNonPHI());
+
+  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+    if (Incoming == LoopMiddleBlock)
+      ScalarPreheaderPN->addIncoming(TruncBackEdgeValue, Incoming);
+    else
+      ScalarPreheaderPN->addIncoming(StartValue, Incoming);
+  }
+
+  Value *ScalarBackEdgeValue =
+      CompactPHIR->getBackedgeValue()->getUnderlyingValue();
+  for (PHINode &Phi : LoopScalarBody->phis()) {
+    if (llvm::is_contained(Phi.incoming_values(), ScalarBackEdgeValue)) {
+      Phi.setIncomingValueForBlock(LoopScalarPreHeader, ScalarPreheaderPN);
+    }
+  }
+
+  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+    if (llvm::is_contained(LCSSAPhi.incoming_values(), ScalarBackEdgeValue)) {
+      LCSSAPhi.addIncoming(TruncBackEdgeValue, LoopMiddleBlock);
+      State.Plan->removeLiveOut(&LCSSAPhi);
+    }
   }
 }
 
@@ -4260,6 +4304,8 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
                .Kind == CM_Scalarize;
   case Instruction::Load:
   case Instruction::Store: {
+    if (Legal->getCompactChainStart(I) != nullptr)
+      return false;
     auto *Ptr = getLoadStorePointerOperand(I);
     auto *Ty = getLoadStoreType(I);
     Type *VTy = Ty;
@@ -4621,6 +4667,12 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
         continue;
       }
 
+      // GEPs in compact chain should be uniform after vectorization.
+      if (isa<GetElementPtrInst>(&I) && Legal->getCompactChainStart(&I)) {
+        addToWorklistIfAllowed(&I);
+        continue;
+      }
+
       // If there's no pointer operand, there's nothing to do.
       auto *Ptr = getLoadStorePointerOperand(&I);
       if (!Ptr)
@@ -6844,6 +6896,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         continue;
       }
 
+      if (isa<StoreInst>(I) && Legal->hasCompactChain()) {
+        InstructionCost Cost = 0;
+        if (!VF.isScalable() || VF.isScalar()) {
+          setWideningDecision(&I, VF, CM_Widen, InstructionCost::getInvalid());
+          continue;
+        }
+        Type *EleTy = getLoadStoreType(&I);
+        VectorType *VectorTy = cast<VectorType>(ToVectorTy(EleTy, VF));
+        const Align Alignment = getLoadStoreAlignment(&I);
+        unsigned AS = getLoadStoreAddressSpace(&I);
+        enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+        Cost += TTI.getMaskedMemoryOpCost(I.getOpcode(), VectorTy, Alignment,
+                                          AS, CostKind);
+        Cost += TTI.getCompactCost();
+        setWideningDecision(&I, VF, CM_Widen, Cost);
+        continue;
+      }
+
       // Choose between Interleaving, Gather/Scatter or Scalarization.
       InstructionCost InterleaveCost = InstructionCost::getInvalid();
       unsigned NumAccesses = 1;
@@ -8590,6 +8660,30 @@ VPRecipeOrVPValueTy VPRecipeBuilder::handleReplication(Instruction *I,
   return toVPRecipeResult(Recipe);
 }
 
+VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenCompactRecipe(
+    Instruction *Instr, ArrayRef<VPValue *> Operands, VPlanPtr &Plan,
+    bool IsSign, const TargetTransformInfo *TTI) {
+  if (auto Phi = dyn_cast<PHINode>(Instr)) {
+    if (Instr->getParent() != OrigLoop->getHeader())
+      return toVPRecipeResult(new VPWidenCompactInstructionRecipe(
+          Instr, Instr->getOpcode(), Operands));
+
+    VPValue *StartV = Operands[0];
+    VPHeaderPHIRecipe *PhiRecipe = new VPCompactPHIRecipe(Phi, StartV, IsSign);
+    recordRecipeOf(cast<Instruction>(
+        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
+    PhisToFix.push_back(PhiRecipe);
+    return toVPRecipeResult(PhiRecipe);
+  }
+
+  if (isa<GetElementPtrInst>(Instr))
+    return nullptr;
+
+  VPValue *Mask = createBlockInMask(Instr->getParent(), *Plan);
+  return toVPRecipeResult(new VPWidenCompactInstructionRecipe(
+      Instr, Instr->getOpcode(), Operands, Mask, TTI));
+}
+
 VPRecipeOrVPValueTy
 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                         ArrayRef<VPValue *> Operands,
@@ -8681,6 +8775,10 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                         ElementCount MaxVF) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
+  // Don't build a fixed-width VPlan version if there is a compact chain in
+  // the loop.
+  if (Legal->hasCompactChain() && !MinVF.isScalable())
+    return;
 
   auto MaxVFTimes2 = MaxVF * 2;
   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -8853,8 +8951,15 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
         continue;
 
-      auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
-          Instr, Operands, Range, VPBB, Plan);
+      VPRecipeOrVPValueTy RecipeOrValue;
+      if (PHINode *ChainStart = Legal->getCompactChainStart(Instr)) {
+        RecipeOrValue = RecipeBuilder.tryToCreateWidenCompactRecipe(
+            Instr, Operands, Plan, Legal->isSign(ChainStart), &TTI);
+      } else {
+        RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands,
+                                                             Range, VPBB, Plan);
+      }
+
       if (!RecipeOrValue)
         RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan);
       // If Instr can be simplified to an existing VPValue, use it.
@@ -10028,7 +10133,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     //  Optimistically generate runtime checks if they are needed. Drop them if
     //  they turn out to not be profitable.
     if (VF.Width.isVector() || SelectedIC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
+      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC,
+                    &LVL);
 
     // Check if it is profitable to vectorize with runtime checks.
     bool ForceVectorization =
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 7ff6749a09089e9..0449b678b714072 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -123,6 +123,11 @@ class VPRecipeBuilder {
                                              VFRange &Range, VPBasicBlock *VPBB,
                                              VPlanPtr &Plan);
 
+  VPRecipeOrVPValueTy
+  tryToCreateWidenCompactRecipe(Instruction *Instr,
+                                ArrayRef<VPValue *> Operands, VPlanPtr &Plan,
+                                bool IsSign, const TargetTransformInfo *TTI);
+
   /// Set the recipe created for given ingredient. This operation is a no-op for
   /// ingredients that were not marked using a nullptr entry in the map.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b4d464ad7ad8c7c..39c0f114a79cfef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -818,6 +818,7 @@ void VPlan::execute(VPTransformState *State) {
     // generated.
     bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
                             isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
+                            isa<VPCompactPHIRecipe>(PhiR) ||
                             (isa<VPReductionPHIRecipe>(PhiR) &&
                              cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
     unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
@@ -828,6 +829,22 @@ void VPlan::execute(VPTransformState *State) {
                               SinglePartNeeded ? State->UF - 1 : Part);
       cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
     }
+
+    // Fix Compact phis if UF > 1.
+    if (isa<VPCompactPHIRecipe>(PhiR)) {
+      for (unsigned Part = 1; Part < State->UF; ++Part) {
+        Value *Val = State->get(PhiR->getBackedgeValue(), Part - 1);
+        // BOSCC vectorization will transform liveouts into phis, and we should
+        // get the underlying value here.
+        if (auto *PN = dyn_cast<PHINode>(Val)) {
+          int ValIdx = isa<PoisonValue>(PN->getOperand(0)) ? 1 : 0;
+          Val = PN->getOperand(ValIdx);
+        }
+        PHINode *Phi = cast<PHINode>(State->get(PhiR, Part));
+        Phi->replaceAllUsesWith(Val);
+        Phi->eraseFromParent();
+      }
+    }
   }
 
   // We do not attempt to preserve DT for outer loop vectorization currently.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index e65a7ab2cd028ee..e04266b7f5f1856 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1274,6 +1274,40 @@ struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
   }
 };
 
+class VPWidenCompactInstructionRecipe : public VPRecipeBase, public VPValue {
+private:
+  Instruction &Ingredient;
+  unsigned Opcode;
+  VPValue *Mask;
+  const TargetTransformInfo *TTI;
+
+  void genCompactInc(VPTransformState &State);
+  void genCompactStore(VPTransformState &State);
+  void genCompactLiveOut(VPTransformState &State);
+
+public:
+  VPWidenCompactInstructionRecipe(Instruction *I, unsigned Opcode,
+                                  ArrayRef<VPValue *> Operands,
+                                  VPValue *Mask = nullptr,
+                                  const TargetTransformInfo *TTI = nullptr)
+      : VPRecipeBase(VPDef::VPCompactInstructionSC, Operands), VPValue(this, I),
+        Ingredient(*I), Opcode(Opcode), Mask(Mask), TTI(TTI) {}
+  ~VPWidenCompactInstructionRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPCompactInstructionSC)
+
+  unsigned getOpcode() const { return Opcode; }
+
+  VPValue *getMask() { return Mask; }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue {
   bool isPointerLoopInvariant() const {
@@ -1616,6 +1650,34 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
   bool isInLoop() const { return IsInLoop; }
 };
 
+class VPCompactPHIRecipe : public VPHeaderPHIRecipe {
+  PHINode *CompactPHI;
+  bool IsCompactSign;
+
+public:
+  VPCompactPHIRecipe(PHINode *Phi, VPValue *Start, bool IsSign)
+      : VPHeaderPHIRecipe(VPDef::VPCompactPHISC, Phi, Start), CompactPHI(Phi),
+        IsCompactSign(IsSign) {}
+
+  ~VPCompactPHIRecipe() override = default;
+
+  VP_CLASSOF_IMPL(VPDef::VPCompactPHISC)
+
+  static inline bool classof(const VPHeaderPHIRecipe *R) {
+    return R->getVPDefID() == VPDef::VPCompactPHISC;
+  }
+
+  bool isSign() { return IsCompactSign; }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPRecipeBase, public VPValue {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2a1213a98095907..06237f779415c33 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include <cassert>
 
@@ -945,6 +946,140 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   VecInd->addIncoming(LastInduction, VectorPH);
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCompactInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                            VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT ";
+  if (getOpcode() != Instruction::Store) {
+    printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+  O << Instruction::getOpcodeName(getOpcode()) << " ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenCompactInstructionRecipe::execute(VPTransformState &State) {
+  switch (getOpcode()) {
+  case Instruction::Add:
+    genCompactInc(State);
+    break;
+  case Instruction::PHI:
+    genCompactLiveOut(State);
+    break;
+  case Instruction::Store:
+    genCompactStore(State);
+    break;
+  default:
+    llvm_unreachable("Unsupport opcode for compact.");
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactStore(VPTransformState &State) {
+  assert(State.VF.isScalable() && "Compact store is for SVE scenario");
+  auto &Builder = State.Builder;
+  VPValue *VPStoredValue = getOperand(0);
+  VPValue *VPAddr = getOperand(1);
+  StoreInst *SI = cast<StoreInst>(&Ingredient);
+  Type *ScalarTy = getLoadStoreType(&Ingredient);
+  Module *M = SI->getModule();
+  VectorType *MaskVTy = cast<VectorType>(State.get(getMask(), 0)->getType());
+  Constant *One = nullptr;
+  unsigned VL = MaskVTy->getElementCount().getKnownMinValue();
+  switch (VL) {
+  case 2:
+    One = ConstantInt::get(Type::getInt64Ty(M->getContext()), 1);
+    break;
+  case 4:
+    One = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
+    break;
+  default:
+    // TODO: Try to support compact.nxv8i16 / compact.nxv16i8 in the future.
+    llvm_unreachable("Unsupported type");
+  }
+  Constant *VOne = ConstantVector::getSplat(MaskVTy->getElementCount(), One);
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    // Generate compact mask.
+    Value *Mask = State.get(getMask(), Part);
+    Value *CompactMaskII = createTargetCompact(Builder, M, TTI, Mask, VOne);
+    assert(CompactMaskII && "Do not support compact in current target.");
+    Value *CompactCmpII =
+        Builder.CreateCmp(ICmpInst::ICMP_EQ, CompactMaskII, VOne);
+
+    // Transform stored value into compact form.
+    VectorType *StoreVTy = VectorType::get(ScalarTy, State.VF);
+    const Align Alignment = getLoadStoreAlignment(&Ingredient);
+    Value *Addr = State.get(VPAddr, VPIteration(Part, 0));
+    Value *StoredValue = State.get(VPStoredValue, Part);
+    Value *SCompact = createTargetCompact(Builder, M, TTI, Mask, StoredValue);
+    assert(SCompact && "Do not support comapct in current target.");
+    Instruction *CompactSI =
+        Builder.CreateMaskedStore(SCompact, Addr, Alignment, CompactCmpII);
+    State.addMetadata(CompactSI, SI);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactInc(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  Module *M = getUnderlyingInstr()->getModule();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *Mask = State.get(getMask(), Part);
+    Constant *PTrue = ConstantInt::getTrue(cast<VectorType>(Mask->getType()));
+    Value *CNTPCall = createTargetCNTP(Builder, M, TTI, PTrue, Mask);
+    Value *Idx = nullptr;
+    if (Part == 0)
+      Idx = State.get(getOperand(0), Part);
+    else
+      Idx = State.get(this, Part - 1);
+    Value *TruncCall = CNTPCall;
+    if (Idx->getType() != CNTPCall->getType()) {
+      TruncCall = Builder.CreateTrunc(CNTPCall, Idx->getType());
+    }
+    Value *NewInc = Builder.CreateAdd(cast<Instruction>(Idx), TruncCall);
+    State.set(this, NewInc, Part);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactLiveOut(
+    VPTransformState &State) {
+  // Get the exit value of phi
+  VPValue *VPExitValue = nullptr;
+  PHINode *Phi = cast<PHINode>(&Ingredient);
+  for (unsigned Idx = 0; Idx < Phi->getNumIncomingValues(); Idx++) {
+    PHINode *PhiOp =
+        dyn_cast_or_null<PHINode>(getOperand(Idx)->getUnderlyingValue());
+    if (!PhiOp) {
+      VPExitValue = getOperand(Idx);
+      break;
+    }
+  }
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *ExitVal = State.get(VPExitValue, Part);
+    State.set(this, ExitVal, Part);
+  }
+}
+
+void VPCompactPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                               VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT-PHI ";
+  printAsOperand(O, SlotTracker);
+  O << " = phi ";
+  printOperands(O, SlotTracker);
+}
+
+void VPCompactPHIRecipe::execute(VPTransformState &State) {
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  BasicBlock *VectorHeader = State.CFG.PrevBB;
+  VPValue *StartVPV = getStartValue();
+  Value *Start = StartVPV->getLiveInIRValue();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    PHINode *Entry = PHINode::Create(Start->getType(), 2, "compact.iv",
+                                     &*VectorHeader->getFirstInsertionPt());
+    Entry->addIncoming(Start, VectorPH);
+    State.set(this, Entry, Part);
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index ac110bb3b0ef9be..3d5cc8f9ea4620e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -348,6 +348,7 @@ class VPDef {
     VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenMemoryInstructionSC,
+    VPCompactInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
     // START: Phi-like recipes. Need to be kept together.
@@ -361,6 +362,7 @@ class VPDef {
     VPWidenPHISC,
     VPWidenIntOrFpInductionSC,
     VPWidenPointerInductionSC,
+    VPCompactPHISC,
     VPReductionPHISC,
     // END: SubclassID for recipes that inherit VPHeaderPHIRecipe
     // END: Phi-like recipes
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll
new file mode 100644
index 000000000000000..d8d3ebdaacd42ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll
@@ -0,0 +1,78 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug -disable-output %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-LABEL: 'kernel_reference'
+; CHECK:      VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = vector-trip-count
+; CHECK-NEXT: vp<%1> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT:   EMIT vp<%1> = EXPAND SCEV (zext i32 %N to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%2> = CANONICAL-INDUCTION
+; CHECK-NEXT:     COMPACT-PHI ir<%n.013> = phi ir<0>, ir<%n.1>
+; CHECK-NEXT:     vp<%4>    = SCALAR-STEPS vp<%2>, ir<1>
+; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%comp>, vp<%4>
+; CHECK-NEXT:     WIDEN ir<%0> = load ir<%arrayidx>
+; CHECK-NEXT:     WIDEN ir<%cmp1> = icmp slt ir<%0>, ir<%a>
+; CHECK-NEXT:     CLONE ir<%arrayidx3> = getelementptr inbounds ir<%B>, vp<%4>
+; CHECK-NEXT:     WIDEN ir<%1> = load ir<%arrayidx3>, ir<%cmp1>
+; CHECK-NEXT:     COMPACT ir<%inc> = add ir<%n.013>, ir<1>
+; CHECK-NEXT:     CLONE ir<%idxprom4> = sext ir<%n.013>
+; CHECK-NEXT:     CLONE ir<%arrayidx5> = getelementptr inbounds ir<%Out_ref>, ir<%idxprom4>
+; CHECK-NEXT:     COMPACT store ir<%1>, ir<%arrayidx5>
+; CHECK-NEXT:     COMPACT ir<%n.1> = phi ir<%inc>, ir<%n.013>
+; CHECK-NEXT:     EMIT vp<%15> = VF * UF + nuw vp<%2>
+; CHECK-NEXT:     EMIT branch-on-count vp<%15>, vp<%0>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
+define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+  %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1 = icmp slt i32 %0, %a
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %inc = add nsw i32 %n.013, 1
+  %idxprom4 = sext i32 %n.013 to i64
+  %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4
+  store i32 %1, ptr %arrayidx5, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.inc, %entry
+  %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ]
+  ret i32 %n.0.lcssa
+}
+
+attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
index 0bafd325bb8e98e..419ba76eb4cd4ac 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll
@@ -4,6 +4,12 @@
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
+; for (i = 0; i < N; i++){
+;  x = comp[i];
+;  if(x < a) Out[n++] = B[i];
+; }
+; return n;
+
 ; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16)
 define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 {
 ; CHECK-LABEL: @kernel_reference(
@@ -12,29 +18,97 @@ define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noali
 ; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[COMPACT_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i32>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP24:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = add i32 [[COMPACT_IV]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> [[TMP17]])
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP29]] = add i32 [[TMP26]], [[TMP28]]
+; CHECK-NEXT:    [[TMP30:%.*]] = sext i32 [[COMPACT_IV]] to i64
+; CHECK-NEXT:    [[TMP31:%.*]] = sext i32 [[TMP26]] to i64
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq <vscale x 4 x i32> [[TMP34]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP36:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP36]], ptr [[TMP32]], i32 4, <vscale x 4 x i1> [[TMP35]])
+; CHECK-NEXT:    [[TMP37:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[TMP37]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP39:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> [[TMP17]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD3]])
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP39]], ptr [[TMP33]], i32 4, <vscale x 4 x i1> [[TMP38]])
+; CHECK-NEXT:    [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP40]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COMPACT_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[N_013:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[N_013:%.*]] = phi i32 [ [[COMPACT_RDX]], [[SCALAR_PH]] ], [ [[N_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[TMP43]], [[A]]
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
 ; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[N_013]], 1
 ; CHECK-NEXT:    [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]]
-; CHECK-NEXT:    store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[IDXPROM4]]
+; CHECK-NEXT:    store i32 [[TMP44]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       for.end.loopexit:
-; CHECK-NEXT:    [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]


