[llvm] [LV][RFC] Generating a conditional VPBB in VPlan that will be skipped when the mask is inactive. (PR #141900)

Elvis Wang via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 14 18:23:32 PST 2025


https://github.com/ElvisWang123 updated https://github.com/llvm/llvm-project/pull/141900

>From 65634758469617f5e86d187299eea44cec6d5bb5 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Mon, 28 Apr 2025 22:31:50 -0700
Subject: [PATCH 1/3] [LV] Pre-commit test case for conditional VPBB. (NFC)

---
 .../RISCV/vplan-conditional-basic-block.ll    | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
new file mode 100644
index 0000000000000..d43c598db5476
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+
+define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[CONTROL1:%.*]], i32 [[CONTROL2:%.*]], i32 [[TARGET:%.*]], i32 [[REG_4_VAL:%.*]], ptr [[REG_24_VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[REG_4_VAL]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[SH_PROM:%.*]] = zext nneg i32 [[CONTROL1]] to i64
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 1, [[SH_PROM]]
+; CHECK-NEXT:    [[SH_PROM5:%.*]] = zext nneg i32 [[CONTROL2]] to i64
+; CHECK-NEXT:    [[SHL6:%.*]] = shl nuw i64 1, [[SH_PROM5]]
+; CHECK-NEXT:    [[SH_PROM10:%.*]] = zext nneg i32 [[TARGET]] to i64
+; CHECK-NEXT:    [[SHL11:%.*]] = shl nuw nsw i64 1, [[SH_PROM10]]
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[REG_4_VAL]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = freeze i64 [[SHL6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[SHL]], [[TMP0]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[SHL11]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[REG_24_VAL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP27]], [[TMP1]]
+; CHECK-NEXT:    [[OR_COND_NOT:%.*]] = icmp eq i64 [[TMP28]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[OR_COND_NOT]], label %[[IF_THEN9:.*]], label %[[FOR_INC]]
+; CHECK:       [[IF_THEN9]]:
+; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[TMP27]], [[SHL11]]
+; CHECK-NEXT:    store i64 [[XOR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp sgt i32 %reg.4.val, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %sh_prom = zext nneg i32 %control1 to i64
+  %shl = shl nuw i64 1, %sh_prom
+  %sh_prom5 = zext nneg i32 %control2 to i64
+  %shl6 = shl nuw i64 1, %sh_prom5
+  %sh_prom10 = zext nneg i32 %target to i64
+  %shl11 = shl nuw nsw i64 1, %sh_prom10
+  %wide.trip.count = zext nneg i32 %reg.4.val to i64
+  %0 = freeze i64 %shl6
+  %1 = or i64 %shl, %0
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.inc ]
+  %arrayidx = getelementptr inbounds i64, ptr %reg.24.val, i64 %indvars.iv
+  %2 = load i64, ptr %arrayidx, align 8
+  %3 = and i64 %2, %1
+  %or.cond.not = icmp eq i64 %3, %1
+  br i1 %or.cond.not, label %if.then9, label %for.inc
+
+if.then9:
+  %xor = xor i64 %2, %shl11
+  store i64 %xor, ptr %arrayidx, align 8
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.

>From f5c344e59133a52f3f1e994ead57ffc933322e76 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 11 May 2025 20:02:56 -0700
Subject: [PATCH 2/3] [TTI] Add `preferControlFlow` for loop vectorizer.
 (NFC)

This patch adds a new hook to TTI to let LV know which form is
preferable for the target.

The default value of preferControlFlow() is false, matching the
current TTI implementation.

If preferControlFlow() returns true, LV will try to generate
conditional VPBBs when possible.
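
For illustration, a minimal sketch of how a target could opt in (the
target class here is hypothetical and the usual CRTP/TTI boilerplate
is elided; only the override itself is the API added by this patch):

  class MyTargetTTIImpl final : public BasicTTIImplBase<MyTargetTTIImpl> {
    // ... standard target TTI boilerplate elided ...
  public:
    // Ask LV to emit conditional VPBBs instead of relying purely on
    // masked operations.
    bool preferControlFlow() const override { return true; }
  };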
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h     | 6 ++++++
 llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 ++
 llvm/include/llvm/CodeGen/BasicTTIImpl.h             | 2 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp            | 4 ++++
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h     | 2 ++
 5 files changed, 16 insertions(+)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5a4eb8daf0af6..dce7aca6761b7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1880,6 +1880,12 @@ class TargetTransformInfo {
   /// maximum register pressure exceeds getNumberOfRegisters.
   LLVM_ABI bool shouldConsiderVectorizationRegPressure() const;
 
+  /// Return true if the loop vectorizer should prefer generating control flow
+  /// (conditional blocks) inside the vector region. Otherwise, the loop
+  /// vectorizer will generate a single block for the vector region and handle
+  /// control flow via masks.
+  LLVM_ABI bool preferControlFlow() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   LLVM_ABI bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 74857a5b83aba..272a3a17dcbba 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -1091,6 +1091,8 @@ class TargetTransformInfoImplBase {
 
   virtual bool shouldConsiderVectorizationRegPressure() const { return false; }
 
+  virtual bool preferControlFlow() const { return false; }
+
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 0b60c38f0ce8c..b5a547f79c663 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -801,6 +801,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return BaseT::preferPredicateOverEpilogue(TFI);
   }
 
+  bool preferControlFlow() const override { return BaseT::preferControlFlow(); }
+
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override {
     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 2961d9361e5fa..3133bf221ae7c 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -370,6 +370,10 @@ bool TargetTransformInfo::preferPredicateOverEpilogue(
   return TTIImpl->preferPredicateOverEpilogue(TFI);
 }
 
+bool TargetTransformInfo::preferControlFlow() const {
+  return TTIImpl->preferControlFlow();
+}
+
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
     bool IVUpdateMayOverflow) const {
   return TTIImpl->getPreferredTailFoldingStyle(IVUpdateMayOverflow);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index e6b75d7c458db..5c5d52c84fef8 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -143,6 +143,8 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
 
   bool shouldConsiderVectorizationRegPressure() const override { return true; }
 
+  bool preferControlFlow() const override { return false; }
+
   InstructionCost
   getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
                            TTI::TargetCostKind CostKind) const override;

>From 697832994680859bdf58fb9231dd35d0f4f8f2b6 Mon Sep 17 00:00:00 2001
From: Elvis Wang <elvis.wang at sifive.com>
Date: Sun, 4 May 2025 16:01:34 -0700
Subject: [PATCH 3/3] [LV] Introduce conditional vector basic block.

This patch adds a transformation that converts flattened control flow
into conditional vector basic blocks.
This transformation lets the program skip masked operations when the
mask has no active lane.

First, the transformation collects all masked stores and their
operands bottom-up, and puts these masked operations into a new vector
basic block.

Second, it splits the original vector loop, inserts the new basic
block between the split blocks, and updates the conditional branches
in the original blocks.
E.g.
Before: {
  vector.loop:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}

After: {
  vector.loop:
    ...
    %any.active.mask = any-of(%mask)
    BranchOnCond %any.active.mask
  Successors vector.if.bb, vector.loop.split

  vector.if.bb:
    ... (Masked operations)
  Successors vector.loop.split

  vector.loop.split:
    ...
    BranchOnCount %IV, %TC
  Successors middle.block, vector.loop
}
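
As a rough scalar model (illustrative only; the function and variable
names are made up) of the control flow each vector iteration executes
after the transform:

  #include <cstddef>
  #include <cstdint>

  // One vector iteration: compute any-of(%mask) and only enter the
  // guarded region (vector.if.bb) when at least one lane is active.
  void processChunk(uint64_t *Data, const bool *Mask, uint64_t Target,
                    size_t VF) {
    bool AnyActive = false;
    for (size_t Lane = 0; Lane != VF; ++Lane)
      AnyActive |= Mask[Lane];        // %any.active.mask = any-of(%mask)
    if (AnyActive) {                  // BranchOnCond %any.active.mask
      for (size_t Lane = 0; Lane != VF; ++Lane)
        if (Mask[Lane])
          Data[Lane] ^= Target;       // masked xor + store
    }
    // Fall through to vector.loop.split: IV update and exit check.
  }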
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  11 ++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 158 ++++++++++++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  23 +++
 .../RISCV/vplan-conditional-basic-block.ll    |  19 ++-
 4 files changed, 205 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 082489f70f1c6..94604df6b14a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -351,6 +351,10 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<bool> PreferControlFlow(
+    "prefer-control-flow", cl::init(false), cl::Hidden,
+    cl::desc("Generate control flow inside the vector region."));
+
 cl::opt<bool> llvm::EnableVPlanNativePath(
     "enable-vplan-native-path", cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -4267,6 +4271,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
           case VPInstruction::ExplicitVectorLength:
             C += VPI->cost(VF, CostCtx);
             break;
+          case VPInstruction::AnyOf:
+            if (!VPI->getUnderlyingValue())
+              C += VPI->cost(VF, CostCtx);
+            break;
           default:
             break;
           }
@@ -8647,6 +8655,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   }
   VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues, *PSE.getSE());
 
+  if (PreferControlFlow || TTI.preferControlFlow())
+    VPlanTransforms::optimizeConditionalVPBB(*Plan);
+
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6627133878fdb..90d7cc1f2c3ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5182,3 +5182,161 @@ void VPlanTransforms::addExitUsersForFirstOrderRecurrences(VPlan &Plan,
     }
   }
 }
+
+void VPlanTransforms::optimizeConditionalVPBB(VPlan &Plan) {
+  VPDominatorTree VPDT(Plan);
+
+  VPValue *HeaderMask = findHeaderMask(Plan);
+
+  // Get the mask from the store recipes.
+  auto GetMask = [&HeaderMask](VPRecipeBase &R) -> VPValue * {
+    using namespace llvm::VPlanPatternMatch;
+    if (!isa<VPWidenStoreRecipe>(R))
+      return nullptr;
+    VPValue *OrigMask = cast<VPWidenMemoryRecipe>(R).getMask();
+    if (!OrigMask || OrigMask == HeaderMask ||
+        match(OrigMask, m_VPInstruction<VPInstruction::ActiveLaneMask>(
+                            m_VPValue(), m_VPValue())))
+      return nullptr;
+
+    return OrigMask;
+  };
+
+  // First, collect all masked stores.
+  SmallVector<std::pair<VPRecipeBase *, VPValue *>> MaskedStores;
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (VPValue *Mask = GetMask(R))
+        MaskedStores.emplace_back(&R, Mask);
+    }
+  }
+
+  if (MaskedStores.empty())
+    return;
+
+  DenseSet<VPRecipeBase *> Candidates;
+  auto AddOperandsToCandidates = [&Candidates](VPRecipeBase *R) {
+    for (VPValue *Op : R->operands())
+      if (VPRecipeBase *OpR = Op->getDefiningRecipe())
+        Candidates.insert(OpR);
+  };
+
+  SmallVector<SetVector<VPRecipeBase *>> Trees;
+  while (!MaskedStores.empty()) {
+    auto [SR, M] = MaskedStores.pop_back_val();
+    Candidates.clear();
+    AddOperandsToCandidates(SR);
+
+    SetVector<VPRecipeBase *> CurrentTree;
+    CurrentTree.insert(SR);
+
+    VPBasicBlock *MaskBlock =
+        M->hasDefiningRecipe() ? M->getDefiningRecipe()->getParent() : nullptr;
+
+    // Don't move recipes before the mask and PHI recipes.
+    auto End =
+        MaskBlock == SR->getParent()
+            ? M->getDefiningRecipe()->getReverseIterator()
+            : std::next(
+                  SR->getParent()->getFirstNonPhi()->getReverseIterator());
+    // Also don't move the recipes through any recipe that may have side effects
+    // or write to memory.
+    for (auto It = std::next(SR->getReverseIterator()); It != End; ++It) {
+      if (It->mayHaveSideEffects() || It->mayWriteToMemory()) {
+        End = It;
+        break;
+      }
+    }
+
+    // Greedily add all recipes that are used to compute the stored value to the
+    // tree. All users of the added recipe must dominate the store
+    // recipe.
+    for (VPRecipeBase &R : make_range(SR->getReverseIterator(), End)) {
+      // Recipe is not part of the tree
+      if (!Candidates.contains(&R))
+        continue;
+
+      if (any_of(R.definedValues(), [&SR = SR, &VPDT](VPValue *Def) {
+            for (VPUser *U : Def->users()) {
+              if (auto *UR = dyn_cast<VPRecipeBase>(U)) {
+                if (UR == SR || VPDT.properlyDominates(UR, SR))
+                  continue;
+              }
+              return true;
+            }
+            return false;
+          }))
+        continue;
+
+      CurrentTree.insert(&R);
+      AddOperandsToCandidates(&R);
+    }
+    // The previous traversal could have added recipes that are used by
+    // non-added recipes, which need to be removed from the list.
+    SmallDenseSet<VPRecipeBase *, 8> ToRemove;
+    bool Changed;
+    do {
+      Changed = false;
+      for (VPRecipeBase *R : CurrentTree) {
+        if (ToRemove.contains(R))
+          continue;
+        if (any_of(R->definedValues(), [&](VPValue *Def) {
+              for (VPUser *U : Def->users()) {
+                if (auto *UR = dyn_cast<VPRecipeBase>(U))
+                  if (!CurrentTree.contains(UR) || ToRemove.contains(UR))
+                    return true;
+              }
+              return false;
+            })) {
+          Changed = true;
+          ToRemove.insert(R);
+        }
+      }
+    } while (Changed);
+
+    for (VPRecipeBase *R : ToRemove)
+      CurrentTree.remove(R);
+
+    if (CurrentTree.size() > 1)
+      Trees.push_back(CurrentTree);
+  }
+
+  for (const auto &List : Trees) {
+    VPRecipeBase *SR = List.front();
+    VPValue *M = cast<VPWidenMemoryRecipe>(SR)->getMask();
+    assert(M && "Mask VPValue must exist at this point");
+    auto Recipes = reverse(List.getArrayRef());
+
+    // Split the current basic block at the store recipe point so that
+    // a predicated block can be added in between.
+    VPBasicBlock *ParentBB = SR->getParent();
+    VPBasicBlock *ContBB = ParentBB->splitAt(SR->getIterator());
+
+    // Create VPBB and insert it between ParentBB and ContBB.
+    VPBasicBlock *IfBB = Plan.createVPBasicBlock("vector.if.bb");
+    VPBlockUtils::insertBlockAfter(IfBB, ParentBB);
+    if (ContBB->getNumSuccessors() == 0)
+      ParentBB->getEnclosingLoopRegion()->setExiting(ContBB);
+
+    // Move recipes into the conditional block.
+    for (VPRecipeBase *R : Recipes)
+      R->moveBefore(*IfBB, IfBB->end());
+
+    // Add the condition and branch in the parent block.
+    auto *ActiveLane = new VPInstruction(VPInstruction::AnyOf, {M}, {}, {},
+                                         DebugLoc::getUnknown(), "any.of.mask");
+
+    auto *BranchOnCond =
+        new VPInstruction(VPInstruction::BranchOnCond, ActiveLane);
+    ParentBB->appendRecipe(ActiveLane);
+    ParentBB->appendRecipe(BranchOnCond);
+
+    // Set proper predecessors and successors for the conditional block.
+    ParentBB->clearSuccessors();
+    ParentBB->setSuccessors({IfBB, ContBB});
+    ContBB->clearPredecessors();
+    ContBB->setPredecessors({ParentBB, IfBB});
+  }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index afdf1655b4622..0328b3aa474df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -402,6 +402,29 @@ struct VPlanTransforms {
   /// users in the original exit block using the VPIRInstruction wrapping to the
   /// LCSSA phi.
   static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range);
+
+  /// Try to convert flattened control flow into a conditional vector basic
+  /// block. If the mask has no active lanes, all masked operations are
+  /// skipped at run time. The transformation collects masked operations
+  /// bottom-up from the masked stores and puts them in a new vector basic
+  /// block. The original vector.loop is split and the newly created basic
+  /// block is inserted in between.
+  ///
+  ///
+  ///      [ ] <-- vector.loop
+  ///      ^  |    %any.active.mask = any-of(%Mask)
+  ///     /   |    BranchOnCond %any.active.mask
+  ///    /    |\
+  ///   |  (F)| \ (T)
+  ///   |     |  v
+  ///   |     |  [ ] <-- vector.if.bb (masked operations)
+  ///   |     |    |
+  ///   |     |    v
+  ///   |     +-->[ ] <-- vector.loop.split
+  ///   |         |  |
+  ///   +---------+  v
+  ///               [ ] <-- middle.block
+  static void optimizeConditionalVPBB(VPlan &Plan);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
index d43c598db5476..26679a2391bcf 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-conditional-basic-block.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v %s | FileCheck %s
+; RUN: opt -p loop-vectorize -force-vector-width=4 -S -mtriple=riscv64 -mattr=+v -prefer-control-flow %s | FileCheck %s
 
 define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr %reg.24.val) {
 ; CHECK-LABEL: define void @test(
@@ -28,20 +28,27 @@ define void @test(i32 %control1, i32 %control2, i32 %target, i32 %reg.4.val, ptr
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN9_SPLIT:.*]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[REG_24_VAL]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[TMP2]], i64 4
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP5]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i1> [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[VECTOR_IF_BB:.*]], label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[VECTOR_IF_BB]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i64> [[WIDE_LOAD3]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP2]], i32 4
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr [[TMP2]], i32 8, <4 x i1> [[TMP7]])
-; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr [[TMP12]], i32 8, <4 x i1> [[TMP8]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP9]], ptr align 8 [[TMP2]], <4 x i1> [[TMP7]])
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> [[TMP10]], ptr align 8 [[TMP4]], <4 x i1> [[TMP8]])
+; CHECK-NEXT:    br label %[[IF_THEN9_SPLIT]]
+; CHECK:       [[IF_THEN9_SPLIT]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]


