[llvm] [VPlan] Run removeDeadRecipes early. (PR #190191)

Thu Apr 2 08:13:55 PDT 2026

llvmbot wrote:



@llvm/pr-subscribers-backend-systemz

@llvm/pr-subscribers-backend-risc-v

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

The iniial simplifyRecipes run can leave dead recipes, which removeDeadRecipes can clean up, similar for dead instructions in the input.

---

Patch is 332.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190191.diff


43 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+5) 
- (modified) llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h (+3) 
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+14-1) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+33-32) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+99-96) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll (+6-6) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-fold-tail.ll (+3-2) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-chained.ll (+10-6) 
- (modified) llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll (+5-6) 
- (modified) llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/PowerPC/vplan-force-tail-with-evl.ll (+4-4) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/find-last.ll (-2) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll (+29-29) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/icmp-uniforms.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll (+2-30) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/tail-folding.ll (-8) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-reductions.ll (+1-3) 
- (modified) llvm/test/Transforms/LoopVectorize/VPlan/vplan-sink-scalars-and-merge.ll (+13-13) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll (+9-8) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/small-size.ll (+26-24) 
- (modified) llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll (+10-5) 
- (modified) llvm/test/Transforms/LoopVectorize/constant-fold-commutative-and.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/constantfolder.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll (+9-8) 
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-tail-folding.ll (+21-15) 
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll (+47-45) 
- (modified) llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll (+4-3) 
- (modified) llvm/test/Transforms/LoopVectorize/iv-select-cmp-fold-tail.ll (+3-2) 
- (modified) llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll (+28-24) 
- (modified) llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll (+78-63) 
- (modified) llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll (+75-70) 
- (modified) llvm/test/Transforms/LoopVectorize/reduction-minmax-users-and-predicated.ll (+27-27) 
- (modified) llvm/test/Transforms/LoopVectorize/reduction-predselect.ll (+36-27) 
- (modified) llvm/test/Transforms/LoopVectorize/select-neg-cond.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll (+17-16) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-div.ll (+20-15) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-masked-mem-opts.ll (+3-2) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll (+34-25) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll (+17-16) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5ddb47a22e84f..2e9fe4ccc098b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7925,6 +7925,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       CM.getInLoopReductions(), Hints.allowReordering());
 
   VPlanTransforms::simplifyRecipes(*VPlan0);
+  VPlanTransforms::removeDeadRecipes(*VPlan0);
   // If we're vectorizing a loop with an uncountable exit, make sure that the
   // recipes are safe to handle.
   // TODO: Remove this once we can properly check the VPlan itself for both
@@ -8182,6 +8183,10 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
                    Range);
   }
 
+  // Ensure scalar VF plans only contain VF=1, as required by hasScalarVFOnly.
+  if (Range.Start.isScalar())
+    Range.End = Range.Start * 2;
+
   for (ElementCount VF : Range)
     Plan->addVF(VF);
   Plan->setName("Initial VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 64315df74dda5..a85ca963c6935 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -106,6 +106,9 @@ class VPRecipeBuilder {
     return Ingredient2Recipe[I];
   }
 
+  /// Return true if a recipe was created for the given ingredient.
+  bool hasRecipe(Instruction *I) const { return Ingredient2Recipe.contains(I); }
+
   /// Build a VPReplicationRecipe for \p VPI. If it is predicated, add the mask
   /// as last operand. Range.End may be decreased to ensure same recipe behavior
   /// from \p Range.Start to \p Range.End.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9944df4778774..14dc7c7447a49 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,9 +668,13 @@ static void removeRedundantInductionCasts(VPlan &Plan) {
           break;
         }
       }
+      // Cast recipe may have been removed by earlier simplifications.
+      if (!FoundUserCast)
+        break;
       FindMyCast = FoundUserCast;
     }
-    FindMyCast->replaceAllUsesWith(IV);
+    if (FindMyCast != IV)
+      FindMyCast->replaceAllUsesWith(IV);
   }
 }
 
@@ -3685,6 +3689,15 @@ void VPlanTransforms::createInterleaveGroups(
   // single VPInterleaveRecipe at its insertion point.
   VPDominatorTree VPDT(Plan);
   for (const auto *IG : InterleaveGroups) {
+    // Skip interleave groups where members don't have recipes. This can happen
+    // when removeDeadRecipes removes recipes that are part of interleave groups
+    // but have no users.
+    if (llvm::any_of(llvm::seq(0u, IG->getFactor()), [&](unsigned I) {
+          Instruction *Member = IG->getMember(I);
+          return Member && !RecipeBuilder.hasRecipe(Member);
+        }))
+      continue;
+
     auto *Start =
         cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
     VPIRMetadata InterleaveMD(*Start);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index cec944ee831da..3dabc1a16e5de 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -291,8 +291,10 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; PRED:       [[VECTOR_BODY]]:
 ; PRED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
-; PRED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
-; PRED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99)
+; PRED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
+; PRED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
+; PRED-NEXT:    [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; PRED-NEXT:    [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 99)
 ; PRED-NEXT:    [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
 ; PRED-NEXT:    br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; PRED:       [[PRED_STORE_IF]]:
@@ -357,7 +359,6 @@ define void @latch_branch_cost(ptr %dst) {
 ; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; PRED:       [[PRED_STORE_CONTINUE14]]:
 ; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
 ; PRED-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 7b74e42962bfa..fe31a17598b3d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -599,9 +599,11 @@ define void @forced_scalar_instr(ptr %gep.dst) {
 ; COMMON-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; COMMON:       [[VECTOR_BODY]]:
 ; COMMON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
-; COMMON-NEXT:    [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
 ; COMMON-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
-; COMMON-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 4)
+; COMMON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; COMMON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; COMMON-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; COMMON-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 4)
 ; COMMON-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
 ; COMMON-NEXT:    br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
 ; COMMON:       [[PRED_STORE_IF]]:
@@ -641,7 +643,6 @@ define void @forced_scalar_instr(ptr %gep.dst) {
 ; COMMON-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; COMMON:       [[PRED_STORE_CONTINUE6]]:
 ; COMMON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; COMMON-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; COMMON-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
 ; COMMON-NEXT:    br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; COMMON:       [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index b1b00bdf84d83..6a9cc494afe33 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -184,8 +184,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IND:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
@@ -203,7 +205,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK:       pred.load.if1:
+; CHECK:       pred.load.if3:
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
@@ -212,12 +214,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
 ; CHECK-NEXT:    [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE2]]
-; CHECK:       pred.load.continue2:
+; CHECK:       pred.load.continue4:
 ; CHECK-NEXT:    [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if5:
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
@@ -226,12 +228,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
 ; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if7:
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
@@ -240,12 +242,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
 ; CHECK-NEXT:    [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue8:
 ; CHECK-NEXT:    [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-NEXT:    br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if9:
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
@@ -254,12 +256,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
 ; CHECK-NEXT:    [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue10:
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-NEXT:    br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK:       pred.load.if9:
+; CHECK:       pred.load.if11:
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
@@ -268,12 +270,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
 ; CHECK-NEXT:    [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE10]]
-; CHECK:       pred.load.continue10:
+; CHECK:       pred.load.continue12:
 ; CHECK-NEXT:    [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
 ; CHECK-NEXT:    [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-NEXT:    br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK:       pred.load.if11:
+; CHECK:       pred.load.if13:
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
@@ -282,12 +284,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
 ; CHECK-NEXT:    [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
-; CHECK:       pred.load.continue12:
+; CHECK:       pred.load.continue14:
 ; CHECK-NEXT:    [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
 ; CHECK-NEXT:    [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-NEXT:    br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
-; CHECK:       pred.load.if13:
+; CHECK:       pred.load.if15:
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
@@ -296,12 +298,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
 ; CHECK-NEXT:    [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
-; CHECK:       pred.load.continue14:
+; CHECK:       pred.load.continue16:
 ; CHECK-NEXT:    [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
 ; CHECK-NEXT:    [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-NEXT:    br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
-; CHECK:       pred.load.if15:
+; CHECK:       pred.load.if17:
 ; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
@@ -310,12 +312,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
 ; CHECK-NEXT:    [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
-; CHECK:       pred.load.continue16:
+; CHECK:       pred.load.continue18:
 ; CHECK-NEXT:    [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
 ; CHECK-NEXT:    [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-NEXT:    br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
-; CHECK:       pred.load.if17:
+; CHECK:       pred.load.if19:
 ; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-NEXT:    [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
@@ -324,12 +326,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
 ; CHECK-NEXT:    [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE18]]
-; CHECK:       pred.load.continue18:
+; CHECK:       pred.load.continue20:
 ; CHECK-NEXT:    [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
 ; CHECK-NEXT:    [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-NEXT:    br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
-; CHECK:       pred.load.if19:
+; CHECK:       pred.load.if21:
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
@@ -338,12 +340,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
 ; CHECK-NEXT:    [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE20]]
-; CHECK:       pred.load.continue20:
+; CHECK:       pred.load.continue22:
 ; CHECK-NEXT:    [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
 ; CHECK-NEXT:    [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-NEXT:    br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
-; CHECK:       pred.load.if21:
+; CHECK:       pred.load.if23:
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-NEXT:    [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
@@ -352,12 +354,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-NEXT:    [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
 ; CHECK-NEXT:    [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE22]]
-;...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/190191