[llvm] [LV][RFC] Generating conditional VPBB that will be skip when the mask is inactive in VPlan. (PR #141900)

Wed May 28 23:21:33 PDT 2025

https://github.com/ElvisWang123 created https://github.com/llvm/llvm-project/pull/141900

This patch gives an overview of the VPlan transformation that creates conditional VPBBs; it will be split into multiple patches later.

RFC: https://discourse.llvm.org/t/rfc-lv-generating-conditional-vpbb-that-will-be-skip-when-the-mask-is-inactive-in-vplan/86591

This patch adds a transformation that converts flattened control flow
into control flow with a conditional vector basic block.
This transformation lets the program skip masked operations that have no
active lane.

First, this transformation collects all masked stores and their operands
bottom-up, and puts these masked operations into a new vector basic
block.

Second, this transformation splits the original vector loop block and
inserts the new basic block between the split blocks. It then updates the
conditional branch in the original block.

E.g.
Before: {
  vector.loop:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}

After: {
  vector.loop:
    ...
    %any.active.mask = any-of(%mask)
    BranchOnCount %any.active.mask, 0
  Successors vector.loop.split, vector.if.bb

  vector.if.bb:
    ... (Masked operations)
  Successors vector.loop.split

  vector.loop.split:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}

Co-authored-by: nikolaypanchenko <kolya.panchenko at sifive.com>

>From 1a76e9c1554e47abe9720379a276eb7164cad346 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 28 Apr 2025 22:31:50 -0700
Subject: [PATCH 1/4] [LV] Pre-commit test case for condition VPBB. (NFC)

---
 .../RISCV/vplan-conditional-basic-block.ll    | 121 ++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..aec6538e1f869
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT:    [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT:    [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT:    [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP11]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT:    [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK:       [[IF_THEN9]]:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT:    store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp sgt i32 %reg.4.val, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %sh_prom = zext nneg i32 %control1 to i64
+  %shl = shl nuw i64 1, %sh_prom
+  %sh_prom5 = zext nneg i32 %control2 to i64
+  %shl6 = shl nuw i64 1, %sh_prom5
+  %sh_prom10 = zext nneg i32 %target to i64
+  %shl11 = shl nuw nsw i64 1, %sh_prom10
+  %wide.trip.count = zext nneg i32 %reg.4.val to i64
+  %0 = freeze i64 %shl6
+  %1 = or i64 %shl, %0
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+  %2 = load i64, ptr %arrayidx, align 8
+  %3 = and i64 %2, %1
+  %or.cond.not = icmp eq i64 %3, %1
+  br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+  %xor = xor i64 %2, %shl11
+  store i64 %xor, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

>From 57b79d2dd285d7a9f388b3b339db82f060abc889 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Thu, 8 May 2025 19:35:07 -0700
Subject: [PATCH 2/4] [LV] Remove headerMask when EVL  transforms.

---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc5be520505eb..d5e5ad95c3e4a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2087,6 +2087,23 @@ void VPlanTransforms::addActiveLaneMask(
     HeaderMask->replaceAllUsesWith(LaneMask);
 }
 
+static bool replaceHeaderMaskToEVL(VPValue *HeaderMask, VPRecipeBase *R) {
+  using namespace llvm::VPlanPatternMatch;
+  VPValue *EdgeMask;
+  if (!R)
+    return false;
+  if (match(R, m_Binary<VPInstruction::BranchOnCount>(
+                   m_VPInstruction<VPInstruction::AnyOf>(
+                       m_Binary<VPInstruction::LogicalAnd>(
+                           m_Specific(HeaderMask), m_VPValue(EdgeMask))),
+                   m_VPValue()))) {
+
+    cast<VPInstruction>(R->getOperand(0))->setOperand(0, EdgeMask);
+    return true;
+  }
+  return false;
+}
+
 /// Try to convert \p CurRecipe to a corresponding EVL-based recipe. Returns
 /// nullptr if no EVL-based recipe could be created.
 /// \p HeaderMask  Header Mask.
@@ -2202,6 +2219,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
     for (VPUser *U : collectUsersRecursively(HeaderMask)) {
       auto *CurRecipe = cast<VPRecipeBase>(U);
+      if (replaceHeaderMaskToEVL(HeaderMask, CurRecipe))
+        continue;
       VPRecipeBase *EVLRecipe = createEVLRecipe(
           HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL, PrevEVL);
       if (!EVLRecipe)

>From 8ee3a5a8710e6533515badd3d3685e6a4e58c989 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 11 May 2025 20:02:56 -0700
Subject: [PATCH 3/4] [TTI] Add `preferFlattenControlFlow` for loop vectorizer.
 (NFC)

This patch adds a new attribute in TTI to let LV know which form is preferred.

Default value of preferFlattenControlFlow is true to match current TTI
implementation.

If preferFlattenControlFlow() returns false, LV will try to generate a
conditional VPBB if possible.
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     | 4 ++++
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h             | 4 ++++
 llvm/lib/Analysis/TargetTransformInfo.cpp            | 4 ++++
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h     | 2 ++
 5 files changed, 16 insertions(+)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8f4ce80ada5ed..20c4b52098770 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1822,6 +1822,10 @@ class TargetTransformInfo {
   /// otherwise scalar epilogue loop.
   LLVM_ABI bool preferEpilogueVectorization() const;
 
+  /// Return true if the loop vectorizer should consider vectorizing with
+  /// flattened control flow, otherwise create conditional vector basic blocks.
+  bool preferFlattenControlFlow() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a80b4c5179bad..1211c80f8ff51 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {
 
   virtual bool preferEpilogueVectorization() const { return true; }
 
+  virtual bool preferFlattenControlFlow() const { return true; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index ff8778168686d..5000bef968213 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,6 +777,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
 
+  bool preferFlattenControlFlow() const override {
+    return thisT()->preferFlattenControlFlow();
+  }
+
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 0f857399660fe..d455e1dc63c13 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -371,6 +371,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(TFI);
 }
 
+bool TargetTransformInfo::preferFlattenControlFlow() const {
+  return TTIImpl->preferFlattenControlFlow();
+}
+
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
     bool IVUpdateMayOverflow) const {
   return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..412dd8ae2b991 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -146,6 +146,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     return false;
   }
 
+  bool preferFlattenControlFlow() const { return false; }
+
   InstructionCost
   getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                         unsigned AddressSpace,

>From e2382bf098ba24fd1b2d19c941abdfd809745a25 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:01:34 -0700
Subject: [PATCH 4/4] [LV] Introduce conditional vector basic block..

This patch adds a transformation that converts flattened control flow
into control flow with a conditional vector basic block.
This transformation lets the program skip masked operations that have no
active lane.

First, this transformation collects all masked stores and their operands
bottom-up, and puts these masked operations into a new vector basic
block.

Second, this transformation splits the original vector loop block and
inserts the new basic block between the split blocks. It then updates the
conditional branch in the original block.

E.g.
Before: {
  vector.loop:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}

After: {
  vector.loop:
    ...
    %any.active.mask = any-of(%mask)
    BranchOnCount %any.active.mask, 0
  Successors vector.loop.split, vector.if.bb

  vector.if.bb:
    ... (Masked operations)
  Successors vector.loop.split

  vector.loop.split:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   8 +
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 157 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  23 +++
 .../RISCV/vplan-conditional-basic-block.ll    |  10 +-
 4 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2fe59a464457f..a834b6106f028 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -351,6 +351,11 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool>
+    PreferFlattenControlFlow("prefer-flatten-control-flow", cl::init(true),
+                             cl::Hidden,
+                             cl::desc("Prefer flatten control flow."));
+
 cl::opt<bool> llvm::EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -9287,6 +9292,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   }
   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
 
+  if (!PreferFlattenControlFlow && !TTI.preferFlattenControlFlow())
+    VPlanTransforms::optimizeConditionalVPBB(*Plan);
+
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d5e5ad95c3e4a..e1c8690986599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3221,3 +3221,160 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
 }
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+
+  VPDominatorTree VPDT;
+  VPDT.recalculate(Plan);
+
+  SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+
+  // Get the mask from the store recipes.
+  auto GetMask = [&HeaderMasks](VPRecipeBase &R) -> VPValue * {
+    using namespace llvm::VPlanPatternMatch;
+    if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
+      VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+      if (!OrigMask)
+        return OrigMask;
+
+      if (any_of(HeaderMasks, [OrigMask](VPValue *HeaderMask) {
+            return OrigMask == HeaderMask;
+          }))
+        return nullptr;
+
+      // Match active.lane.mask.
+      if (match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+                              m_VPValue(), m_VPValue())))
+        return nullptr;
+
+      return OrigMask;
+    }
+    return nullptr;
+  };
+
+  // First, collect all masked stores.
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Tries;
+  while (!MaskedStores.empty()) {
+    auto [LR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(LR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(LR);
+
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+    auto End = MaskBlock == LR->getParent()
+                   ? M->getDefiningRecipe()->getReverseIterator()
+                   : LR->getParent()->getFirstNonPhi()->getReverseIterator();
+    // Greedily add all recipes that are used to compute stored value to the
+    // tree. All users of the added recipe must dominate the store
+    // recipe.
+    for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
+      // Recipe is not a part of the tree
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == LR || VPDT.properlyDominates(UR, LR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+    // Previous traversal could add recipes that are used by non-added recipes,
+    // thus need to be removed from the list.
+    DenseSet<VPRecipeBase *> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Tries.push_back(CurrentTree);
+  }
+  for (const auto &List : Tries) {
+    VPRecipeBase *LR = List.front();
+    VPValue *M = cast<VPWidenMemoryRecipe>(LR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split current basic block at LR point so that VPConditionalRegionBlock
+    // can be added inbetween.
+    VPBasicBlock *ParentBB = LR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
+
+    // Create VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Copy recipes into conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane =
+        new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+    Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+    LLVMContext &Ctx = CanonicalIVType->getContext();
+    VPValue *Zero =
+        Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt1Ty(Ctx), 0));
+
+    auto *BranchOnCount =
+        new VPInstruction(VPInstruction::BranchOnCount, {ActiveLane, Zero});
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCount);
+
+    // Set proper predecessors and successors for the modified basic blocks.
+    ParentBB->clearSuccessors();
+    ParentBB->setTwoSuccessors(ContBB, IfBB);
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..0e535d6bf7c36 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,6 +234,29 @@ struct VPlanTransforms {
   /// removed in the future.
   static DenseMap<VPBasicBlock *, VPValue *>
   introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block. If the mask has no active bits, all masked operations are skipped.
+  /// This transformation collects all masked operations bottom-up from the
+  /// masked stores and puts them in a new vector basic block. The original
+  /// vector.loop block is split and the newly created basic block is inserted
+  /// in between.
+  ///
+  /// After the transformation, the VPlan will look like:
+  /// vector.loop:
+  ///   ...
+  ///   %any.active.mask = any-of(%Mask)
+  ///   Branch-On-Count %any.active.mask, 0
+  /// successors vector.loop.split, vector.if.bb
+  ///
+  /// vector.if.bb:
+  ///   (Masked operations)
+  /// successors vector.loop.split
+  ///
+  /// vector.loop.split:
+  ///   ...
+  /// successors middle.block, vector.loop
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
index aec6538e1f869..2b8f2542ab544 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-flatten-control-flow=false %s | FileCheck %s
 
 define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
 ; CHECK-LABEL: define void @test(
@@ -28,7 +28,7 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN9_SPLIT:.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
@@ -40,10 +40,16 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i1 [[TMP13]], false
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[IF_THEN9_SPLIT]], label %[[VECTOR_IF_BB:.*]]
+; CHECK:       [[VECTOR_IF_BB]]:
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP11]], i32 8, <4 x i1> [[TMP7]])
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[IF_THEN9_SPLIT]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]