[llvm] [LV][RFC] Generating a conditional VPBB that will be skipped when the mask is inactive in VPlan. (PR #141900)
Elvis Wang via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 5 23:46:56 PDT 2025
https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/141900
>From fc01e0d399550d6df41cac8bfa0a9c041eb58d87 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 28 Apr 2025 22:31:50 -0700
Subject: [PATCH 1/3] [LV] Pre-commit test case for conditional VPBB. (NFC)
---
.../RISCV/vplan-conditional-basic-block.ll | 119 ++++++++++++++++++
1 file changed, 119 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..d43c598db5476
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK: [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT: [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT: [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT: [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT: [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT: [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT: br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK: [[IF_THEN9]]:
+; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT: store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[FOR_END]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp1 = icmp sgt i32 %reg.4.val, 0
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %sh_prom = zext nneg i32 %control1 to i64
+ %shl = shl nuw i64 1, %sh_prom
+ %sh_prom5 = zext nneg i32 %control2 to i64
+ %shl6 = shl nuw i64 1, %sh_prom5
+ %sh_prom10 = zext nneg i32 %target to i64
+ %shl11 = shl nuw nsw i64 1, %sh_prom10
+ %wide.trip.count = zext nneg i32 %reg.4.val to i64
+ %0 = freeze i64 %shl6
+ %1 = or i64 %shl, %0
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+ %2 = load i64, ptr %arrayidx, align 8
+ %3 = and i64 %2, %1
+ %or.cond.not = icmp eq i64 %3, %1
+ br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+ %xor = xor i64 %2, %shl11
+ store i64 %xor, ptr %arrayidx, align 8
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From 680098af4740622f849c082bde6f64212fcd3135 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 11 May 2025 20:02:56 -0700
Subject: [PATCH 2/3] [TTI] Add `preferControlFlow` for loop vectorizer.
 (NFC)
This patch adds a new hook in TTI to let LV know which form is preferred.
The default value of preferControlFlow() is false to match the current
TTI implementation.
If preferControlFlow() returns true, LV will try to generate conditional
VPBBs where possible.
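As an illustration (not part of this patch): a target that wants the
conditional-VPBB form would override the new hook to return true. A
minimal C++ sketch, where MyTargetTTIImpl is a hypothetical target
mirroring the RISCVTTIImpl override below (which currently returns
false):

  // Hypothetical target opting in to conditional vector basic blocks.
  class MyTargetTTIImpl final : public BasicTTIImplBase<MyTargetTTIImpl> {
  public:
    // Ask LV to emit a conditional VPBB instead of flattening all
    // control flow into masks.
    bool preferControlFlow() const override { return true; }
    // ... other TTI overrides elided ...
  };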
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 5 +++++
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 2 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++++
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 2 ++
5 files changed, 15 insertions(+)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index aa4550de455e0..eb3b1d745bf54 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1835,6 +1835,11 @@ class TargetTransformInfo {
/// otherwise scalar epilogue loop.
LLVM_ABI bool preferEpilogueVectorization() const;
+ /// Return true if the loop vectorizer can generate control flow (conditional
+ /// blocks) inside the vector region. Otherwise LV will generate a single
+ /// block for the vector region and control flow will be handled by masks.
+ LLVM_ABI bool preferControlFlow() const;
+
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index abdbca04488db..c01e42440a17c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {
virtual bool preferEpilogueVectorization() const { return true; }
+ virtual bool preferControlFlow() const { return false; }
+
virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
return true;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 1fcedcd35ee8b..02173deeb6f5a 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -777,6 +777,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return BaseT::preferPredicateOverEpilogue(TFI);
}
+ bool preferControlFlow() const override { return BaseT::preferControlFlow(); }
+
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c7eb2ec18c679..8b4ea3fdac478 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -371,6 +371,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
return TTIImpl->preferPredicateOverEpilogue(TFI);
}
+bool TargetTransformInfo::preferControlFlow() const {
+ return TTIImpl->preferControlFlow();
+}
+
TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
bool IVUpdateMayOverflow) const {
return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 05d504cbcb6bb..4d6de79ff4010 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -138,6 +138,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
return false;
}
+ bool preferControlFlow() const override { return false; }
+
InstructionCost
getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
>From 66f94541646cf0c6a9a5603dc583156a77e1ef31 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:01:34 -0700
Subject: [PATCH 3/3] [LV] Introduce conditional vector basic blocks.
This patch adds a transformation that converts flattened control flow
into a conditional vector basic block.
This transformation lets the program skip masked operations when the
mask has no active lane.
First, the transformation collects all masked stores and their operands
bottom-up, and puts these masked operations into a new vector basic
block.
Second, it splits the original vector loop, inserts the new basic block
between the split blocks, and updates the conditional branch in the
original blocks.
E.g.
Before: {
vector.loop:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
After: {
vector.loop:
...
%any.active.mask = any-of(%mask)
BranchOnCount %any.active.mask, 0
Successors vector.loop.split, vector.if.bb
vector.if.bb:
... (Masked operations)
Successors vector.loop.split
vector.loop.split:
...
BranchOnCount %IV, %TC
Successors middle.block, vector.loop
}
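The block surgery that produces this shape, condensed from the
implementation in this patch (a sketch of the key steps only; the tree
collection and exiting-block bookkeeping are omitted, see
VPlanTransforms::optimizeConditionalVPBB below):

  // For each collected tree: LR is the masked store, M is its mask.
  VPBasicBlock *ParentBB = LR->getParent();
  // Everything from LR onwards becomes vector.loop.split.
  VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
  VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
  VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
  // Guard: branch to the split block when no mask lane is active.
  auto *AnyActive =
      new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
  ParentBB->appendRecipe(AnyActive);
  ParentBB->setSuccessors({ContBB, IfBB});
  ContBB->setPredecessors({ParentBB, IfBB});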
---
.../Transforms/Vectorize/LoopVectorize.cpp | 6 +
.../Transforms/Vectorize/VPlanTransforms.cpp | 157 ++++++++++++++++++
.../Transforms/Vectorize/VPlanTransforms.h | 23 +++
.../RISCV/vplan-conditional-basic-block.ll | 11 +-
4 files changed, 195 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a52aa8420b301..a929ab98ed91c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -354,6 +354,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
+static cl::opt<bool> PreferControlFlow(
+ "prefer-control-flow", cl::init(false), cl::Hidden,
+ cl::desc("Generate control flow inside the vector region."));
+
cl::opt<bool> llvm::EnableVPlanNativePath(
"enable-vplan-native-path", cl::Hidden,
cl::desc("Enable VPlan-native vectorization path with "
@@ -8455,6 +8459,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
!VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
*Plan, CM.getMaxSafeElements()))
break;
+ if (PreferControlFlow || TTI.preferControlFlow())
+ VPlanTransforms::optimizeConditionalVPBB(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index a7965a053e6e3..109db46eac3c3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2229,6 +2229,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
// tryToBuildVPlanWithVPRecipes beforehand.
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
auto *CurRecipe = cast<VPRecipeBase>(U);
+
VPRecipeBase *EVLRecipe =
optimizeMaskToEVL(HeaderMask, *CurRecipe, TypeInfo, *AllOneMask, EVL);
if (!EVLRecipe)
@@ -3517,3 +3518,159 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator(
MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
}
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
+
+ SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+
+ // Get the mask from the store recipes.
+ auto GetMask = [&HeaderMasks](VPRecipeBase &R) -> VPValue * {
+ using namespace llvm::VPlanPatternMatch;
+ if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(R)) {
+ VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+ if (!OrigMask)
+ return OrigMask;
+
+ if (any_of(HeaderMasks, [OrigMask](VPValue *HeaderMask) {
+ return OrigMask == HeaderMask;
+ }))
+ return nullptr;
+
+ // Match active.lane.mask.
+ if (match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+ m_VPValue(), m_VPValue())))
+ return nullptr;
+
+ return OrigMask;
+ }
+ return nullptr;
+ };
+
+ // First, collect all masked stores.
+ SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getEntry());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ for (VPRecipeBase &R : *VPBB) {
+ if (VPValue *Mask = GetMask(R))
+ MaskedStores.emplace_back(&R, Mask);
+ }
+ }
+
+ DenseSet<VPRecipeBase *> Candidates;
+ auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+ for (VPValue *Op : R->operands())
+ if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+ Candidates.insert(OpR);
+ };
+
+ SmallVector<SetVector<VPRecipeBase *>> Trees;
+ while (!MaskedStores.empty()) {
+ auto [LR, M] = MaskedStores.pop_back_val();
+ Candidates.clear();
+ AddOperandsToCandidates(LR);
+
+ SetVector<VPRecipeBase *> CurrentTree;
+ CurrentTree.insert(LR);
+
+ VPBasicBlock *MaskBlock =
+ M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+ auto End = MaskBlock == LR->getParent()
+ ? M->getDefiningRecipe()->getReverseIterator()
+ : LR->getParent()->getFirstNonPhi()->getReverseIterator();
+ // Greedily add all recipes that are used to compute stored value to the
+ // tree. All users of the added recipe must dominate the store
+ // recipe.
+ for (VPRecipeBase &R : make_range(LR->getReverseIterator(), End)) {
+ // Recipe is not part of the tree.
+ if (!Candidates.contains(&R))
+ continue;
+
+ if (any_of(R.definedValues(), [&LR = LR, &VPDT](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+ if (UR == LR || VPDT.properlyDominates(UR, LR))
+ continue;
+ }
+ return true;
+ }
+ return false;
+ }))
+ continue;
+
+ CurrentTree.insert(&R);
+ AddOperandsToCandidates(&R);
+ }
+ // The previous traversal may have added recipes that are also used by
+ // recipes outside the tree; those must be removed from the set.
+ DenseSet<VPRecipeBase *> ToRemove;
+ bool Changed;
+ do {
+ Changed = false;
+ for (VPRecipeBase *R : CurrentTree) {
+ if (ToRemove.contains(R))
+ continue;
+ if (any_of(R->definedValues(), [&](VPValue *Def) {
+ for (VPUser *U : Def->users()) {
+ if (auto *UR = dyn_cast<VPRecipeBase>(U))
+ if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+ return true;
+ }
+ return false;
+ })) {
+ Changed = true;
+ ToRemove.insert(R);
+ }
+ }
+ } while (Changed);
+
+ for (VPRecipeBase *R : ToRemove)
+ CurrentTree.remove(R);
+
+ if (CurrentTree.size() > 1)
+ Trees.push_back(CurrentTree);
+ }
+ for (const auto &List : Trees) {
+ VPRecipeBase *LR = List.front();
+ VPValue *M = cast<VPWidenMemoryRecipe>(LR)->getMask();
+ assert(M && "Mask VPValue must exist at this point");
+ auto Recipes = reverse(List.getArrayRef());
+
+ // Split the current basic block at LR so that the conditional block can
+ // be inserted in between.
+ VPBasicBlock *ParentBB = LR->getParent();
+ VPBasicBlock *ContBB = ParentBB->splitAt(LR->getIterator());
+
+ // Create VPBB and insert it between ParentBB and ContBB.
+ VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+ VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+ if (ContBB->getNumSuccessors() == 0)
+ ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+ // Copy recipes into conditional block.
+ for (VPRecipeBase *R : Recipes)
+ R->moveBefore(*IfBB, IfBB->end());
+
+ // Add the condition and branch in the parent block.
+ auto *ActiveLane =
+ new VPInstruction(VPInstruction::AnyOf, {M}, nullptr, "any.of.mask");
+
+ Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
+ LLVMContext &Ctx = CanonicalIVType->getContext();
+ VPValue *Zero =
+ Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt1Ty(Ctx), 0));
+
+ auto *BranchOnCount =
+ new VPInstruction(VPInstruction::BranchOnCount, {ActiveLane, Zero});
+ ParentBB->appendRecipe(ActiveLane);
+ ParentBB->appendRecipe(BranchOnCount);
+
+ // Set proper predecessors and successors for the modified basic blocks.
+ ParentBB->clearSuccessors();
+ ParentBB->setSuccessors({ContBB, IfBB});
+ ContBB->clearPredecessors();
+ ContBB->setPredecessors({ParentBB, IfBB});
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5943684e17a76..c90e9c6370a71 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -284,6 +284,29 @@ struct VPlanTransforms {
static void
addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
std::optional<unsigned> VScaleForTuning);
+
+ /// Try to convert flattened control flow into a conditional vector basic
+ /// block. If the mask has no active bits, all the masked operations are
+ /// skipped. This transformation collects all masked operations bottom-up
+ /// from the masked stores and puts them in a new vector basic block. The
+ /// original vector.loop will be split and the newly created basic block
+ /// will be inserted in between.
+ ///
+ /// After the transformation the VPlan will look like:
+ /// vector.loop:
+ /// ...
+ /// %any.active.mask = any-of(%Mask)
+ /// Branch-On-Count %any.active.mask, 0
+ /// successors vector.loop.split, vector.if.bb
+ ///
+ /// vector.if.bb:
+ /// (Masked operations)
+ /// successors vector.loop.split
+ ///
+ /// vector.loop.split:
+ /// ...
+ /// successors middle.block, vector.loop
+ static void optimizeConditionalVPBB(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
index d43c598db5476..55dfc811a5e8c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
; CHECK-LABEL: define void @test(
@@ -28,7 +28,7 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY_SPLIT:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
@@ -37,11 +37,18 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP11]])
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i1 [[TMP13]], false
+; CHECK-NEXT: br i1 [[TMP14]], label %[[VECTOR_BODY_SPLIT]], label %[[VECTOR_IF_BB:.*]]
+; CHECK: [[VECTOR_IF_BB]]:
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT: br label %[[VECTOR_BODY_SPLIT]]
+; CHECK: [[VECTOR_BODY_SPLIT]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]