[llvm] [LoopVectorize] Don't scalarize predicated instruction with optsize (PR #129265)

via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 28 08:25:41 PST 2025


llvmbot wrote:



@llvm/pr-subscribers-vectorizers

Author: John Brawn (john-brawn-arm)

<details>
<summary>Changes</summary>

Scalarizing predicated instructions has a worse code-size impact than keeping a scalar epilogue, which we already forbid under optsize, so we shouldn't allow scalarization either.
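For illustration, this is roughly the per-lane shape that a scalarized predicated store takes in the vector body (a minimal hand-written sketch; the hypothetical function @scalarized_lane below is not from the patch, but the same pred.store.if/pred.store.continue pattern is visible in the test diffs further down). The extract/branch/store sequence is repeated once per lane, which is what drives up code size compared to a plain scalar epilogue:

```llvm
; Sketch of one lane of a scalarized predicated store; the vectorizer
; repeats this extract/branch/store sequence for every lane of the mask.
define void @scalarized_lane(<4 x i1> %mask, <4 x i32> %vals, ptr %dst) {
entry:
  %lane0.active = extractelement <4 x i1> %mask, i32 0
  br i1 %lane0.active, label %pred.store.if, label %pred.store.continue

pred.store.if:
  %val0 = extractelement <4 x i32> %vals, i32 0
  store i32 %val0, ptr %dst, align 4
  br label %pred.store.continue

pred.store.continue:
  ret void
}
```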

A couple of notes on the implementation:
 * OptForSizeBasedOnProfile has been moved into the cost model and renamed to OptForSize, as shouldOptimizeForSize checks both the function attribute and profile.
 * We still allow tail folding if no instructions need to be scalarized; see, for example, foo_optsize in the test Transforms/LoopVectorize/X86/optsize.ll, and the sketch after this list.
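To make the second note concrete, here is a minimal sketch (hypothetical function name, not taken from the patch) of a predicated store that needs no scalarization: when the target supports masked memory operations, the whole predicated store stays a single masked-store intrinsic call, so tail folding adds no per-lane branches and remains acceptable under optsize:

```llvm
; Sketch of a tail-folded store that needs no scalarization: the predication
; is carried entirely by the mask operand of the masked-store intrinsic.
define void @masked_tail_folded_store(<4 x i32> %vals, ptr %dst, <4 x i1> %mask) {
entry:
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %vals, ptr %dst, i32 4, <4 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
```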

This change requires a lot of test updates. Where a test specifically exercises scalarized predicated instructions, I've adjusted it so that it still does, either by dropping optsize when it makes no difference or by forcing tail predication to be enabled. Where a test is specifically about optsize, I've updated it to check that we're not scalarizing.

Fixes #66652

---

Patch is 104.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129265.diff


16 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+26-17) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+11-37) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll (+2-177) 
- (modified) llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll (+6-181) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/optsize.ll (+82-82) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/small-size.ll (+48-265) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll (+2-3) 
- (modified) llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll (+2-2) 
- (modified) llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/struct-return.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll (+1-1) 
- (modified) llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll (+16-31) 


``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e2612698b6b0f..601591cae9dc5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -491,12 +491,7 @@ class InnerLoopVectorizer {
         MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
         PSI(PSI), RTChecks(RTChecks), Plan(Plan),
-        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {
-    // Query this against the original loop and save it here because the profile
-    // of the original loop header may change as the transformation happens.
-    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
-        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
-  }
+        VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {}
 
   virtual ~InnerLoopVectorizer() = default;
 
@@ -669,10 +664,6 @@ class InnerLoopVectorizer {
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
 
-  // Whether this loop should be optimized for size based on profile guided size
-  // optimizatios.
-  bool OptForSizeBasedOnProfile;
-
   /// Structure to hold information about generated runtime checks, responsible
   /// for cleaning the checks, if vectorization turns out unprofitable.
   GeneratedRTChecks &RTChecks;
@@ -986,13 +977,18 @@ class LoopVectorizationCostModel {
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
-                             InterleavedAccessInfo &IAI)
+                             InterleavedAccessInfo &IAI,
+                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
         Hints(Hints), InterleaveInfo(IAI) {
     if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
       initializeVScaleForTuning();
     CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
+    // Query this against the original loop and save it here because the profile
+    // of the original loop header may change as the transformation happens.
+    OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                             PGSOQueryType::IRPass);
   }
 
   /// \return An upper bound for the vectorization factors (both fixed and
@@ -1833,6 +1829,10 @@ class LoopVectorizationCostModel {
 
   /// The kind of cost that we are calculating
   TTI::TargetCostKind CostKind;
+
+  /// Whether this loop should be optimized for size based on function attribute
+  /// or profile information.
+  bool OptForSize;
 };
 } // end namespace llvm
 
@@ -2612,9 +2612,8 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   if (!SCEVCheckBlock)
     return nullptr;
 
-  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
-           (OptForSizeBasedOnProfile &&
-            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
+  assert((!Cost->OptForSize ||
+          Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
   assert(!LoopBypassBlocks.empty() &&
          "Should already be a bypass block due to iteration count check");
@@ -2639,7 +2638,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
   if (!MemCheckBlock)
     return nullptr;
 
-  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
+  if (Cost->OptForSize) {
     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
            "Cannot emit memory checks when optimizing for size, unless forced "
            "to vectorize.");
@@ -5518,6 +5517,9 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
     // includes the scalarization overhead of the predicated instruction.
     InstructionCost VectorCost = getInstructionCost(I, VF);
 
+    if (VectorCost == InstructionCost::getInvalid())
+      continue;
+
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
@@ -5660,6 +5662,13 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
+  // Don't scalarize predicated instructions when optimizing for size unless
+  // we're forced to.
+  if (isPredicatedInst(I) && OptForSize &&
+      !ForceTailFoldingStyle.getNumOccurrences() &&
+      Hints->getForce() != LoopVectorizeHints::FK_Enabled)
+    return InstructionCost::getInvalid();
+
   Type *ValTy = getLoadStoreType(I);
   auto *SE = PSE.getSE();
 
@@ -10090,7 +10099,7 @@ static bool processLoopInVPlanNativePath(
       getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+                                &Hints, IAI, PSI, BFI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -10627,7 +10636,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+                                F, &Hints, IAI, PSI, BFI);
   // Use the planner for vectorization.
   LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
                                ORE);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index cf4fc143fe8c3..843ff4fd5477e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1588,55 +1588,29 @@ exit:
   ret void
 }
 
-define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
+define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) {
 ; DEFAULT-LABEL: define void @redundant_branch_and_tail_folding(
-; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; DEFAULT-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
 ; DEFAULT-NEXT:  entry:
 ; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; DEFAULT:       vector.ph:
 ; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 20)
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_IND1:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_IND:%.*]] = add <4 x i64> [[VEC_IND1]], splat (i64 4)
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 1)
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i32>
-; DEFAULT-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
-; DEFAULT-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; DEFAULT:       pred.store.if:
-; DEFAULT-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
-; DEFAULT-NEXT:    store i32 [[TMP4]], ptr [[DST]], align 4
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; DEFAULT:       pred.store.continue:
-; DEFAULT-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
-; DEFAULT-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; DEFAULT:       pred.store.if1:
-; DEFAULT-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
-; DEFAULT-NEXT:    store i32 [[TMP6]], ptr [[DST]], align 4
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE2]]
-; DEFAULT:       pred.store.continue2:
-; DEFAULT-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
-; DEFAULT-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; DEFAULT:       pred.store.if3:
-; DEFAULT-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
-; DEFAULT-NEXT:    store i32 [[TMP8]], ptr [[DST]], align 4
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE4]]
-; DEFAULT:       pred.store.continue4:
-; DEFAULT-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
-; DEFAULT-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; DEFAULT:       pred.store.if5:
 ; DEFAULT-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; DEFAULT-NEXT:    store i32 [[TMP10]], ptr [[DST]], align 4
-; DEFAULT-NEXT:    br label [[PRED_STORE_CONTINUE6]]
-; DEFAULT:       pred.store.continue6:
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
-; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; DEFAULT-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; DEFAULT-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; DEFAULT-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; DEFAULT:       loop.header:
 ; DEFAULT-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -1653,7 +1627,7 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize {
 ; DEFAULT-NEXT:    ret void
 ;
 ; PRED-LABEL: define void @redundant_branch_and_tail_folding(
-; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; PRED-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) {
 ; PRED-NEXT:  entry:
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; PRED:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index 291b8f3348f09..1fcb0eb8048b2 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -229,7 +229,6 @@ for.cond.cleanup:
 
 ; This should be vectorized and tail predicated without optsize, as that's
 ; faster, but not with optsize, as it's much larger.
-; FIXME: Currently we avoid tail predication only with minsize
 define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) {
 ; DEFAULT-LABEL: define void @tail_predicate_without_optsize(
 ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) {
@@ -429,182 +428,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; OPTSIZE-LABEL: define void @tail_predicate_without_optsize(
 ; OPTSIZE-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
 ; OPTSIZE-NEXT:  [[ENTRY:.*]]:
-; OPTSIZE-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
-; OPTSIZE:       [[VECTOR_PH]]:
-; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
-; OPTSIZE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
-; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0
-; OPTSIZE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer
-; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0
-; OPTSIZE-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer
-; OPTSIZE-NEXT:    br label %[[VECTOR_BODY:.*]]
-; OPTSIZE:       [[VECTOR_BODY]]:
-; OPTSIZE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ]
-; OPTSIZE-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
-; OPTSIZE-NEXT:    [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ]
-; OPTSIZE-NEXT:    [[TMP72:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14)
-; OPTSIZE-NEXT:    [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
-; OPTSIZE-NEXT:    [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
-; OPTSIZE-NEXT:    [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
-; OPTSIZE-NEXT:    [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]]
-; OPTSIZE-NEXT:    [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2)
-; OPTSIZE-NEXT:    [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]]
-; OPTSIZE-NEXT:    [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]]
-; OPTSIZE-NEXT:    [[TMP8:%.*]] = extractelement <16 x i1> [[TMP72]], i32 0
-; OPTSIZE-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; OPTSIZE:       [[PRED_STORE_IF]]:
-; OPTSIZE-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; OPTSIZE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]]
-; OPTSIZE-NEXT:    [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0
-; OPTSIZE-NEXT:    store i8 [[TMP11]], ptr [[TMP10]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE]]:
-; OPTSIZE-NEXT:    [[TMP12:%.*]] = extractelement <16 x i1> [[TMP72]], i32 1
-; OPTSIZE-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
-; OPTSIZE:       [[PRED_STORE_IF7]]:
-; OPTSIZE-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 1
-; OPTSIZE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]]
-; OPTSIZE-NEXT:    [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1
-; OPTSIZE-NEXT:    store i8 [[TMP15]], ptr [[TMP14]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE8]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE8]]:
-; OPTSIZE-NEXT:    [[TMP16:%.*]] = extractelement <16 x i1> [[TMP72]], i32 2
-; OPTSIZE-NEXT:    br i1 [[TMP16]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
-; OPTSIZE:       [[PRED_STORE_IF9]]:
-; OPTSIZE-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; OPTSIZE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]]
-; OPTSIZE-NEXT:    [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2
-; OPTSIZE-NEXT:    store i8 [[TMP19]], ptr [[TMP18]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE10]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE10]]:
-; OPTSIZE-NEXT:    [[TMP20:%.*]] = extractelement <16 x i1> [[TMP72]], i32 3
-; OPTSIZE-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
-; OPTSIZE:       [[PRED_STORE_IF11]]:
-; OPTSIZE-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX]], 3
-; OPTSIZE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]]
-; OPTSIZE-NEXT:    [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3
-; OPTSIZE-NEXT:    store i8 [[TMP23]], ptr [[TMP22]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE12]]:
-; OPTSIZE-NEXT:    [[TMP24:%.*]] = extractelement <16 x i1> [[TMP72]], i32 4
-; OPTSIZE-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
-; OPTSIZE:       [[PRED_STORE_IF13]]:
-; OPTSIZE-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX]], 4
-; OPTSIZE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]]
-; OPTSIZE-NEXT:    [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4
-; OPTSIZE-NEXT:    store i8 [[TMP27]], ptr [[TMP26]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE14]]:
-; OPTSIZE-NEXT:    [[TMP28:%.*]] = extractelement <16 x i1> [[TMP72]], i32 5
-; OPTSIZE-NEXT:    br i1 [[TMP28]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]]
-; OPTSIZE:       [[PRED_STORE_IF15]]:
-; OPTSIZE-NEXT:    [[TMP29:%.*]] = add i64 [[INDEX]], 5
-; OPTSIZE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]]
-; OPTSIZE-NEXT:    [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5
-; OPTSIZE-NEXT:    store i8 [[TMP31]], ptr [[TMP30]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE16]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE16]]:
-; OPTSIZE-NEXT:    [[TMP32:%.*]] = extractelement <16 x i1> [[TMP72]], i32 6
-; OPTSIZE-NEXT:    br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]]
-; OPTSIZE:       [[PRED_STORE_IF17]]:
-; OPTSIZE-NEXT:    [[TMP33:%.*]] = add i64 [[INDEX]], 6
-; OPTSIZE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]]
-; OPTSIZE-NEXT:    [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6
-; OPTSIZE-NEXT:    store i8 [[TMP35]], ptr [[TMP34]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE18]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE18]]:
-; OPTSIZE-NEXT:    [[TMP36:%.*]] = extractelement <16 x i1> [[TMP72]], i32 7
-; OPTSIZE-NEXT:    br i1 [[TMP36]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]]
-; OPTSIZE:       [[PRED_STORE_IF19]]:
-; OPTSIZE-NEXT:    [[TMP37:%.*]] = add i64 [[INDEX]], 7
-; OPTSIZE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]]
-; OPTSIZE-NEXT:    [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7
-; OPTSIZE-NEXT:    store i8 [[TMP39]], ptr [[TMP38]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE20]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE20]]:
-; OPTSIZE-NEXT:    [[TMP40:%.*]] = extractelement <16 x i1> [[TMP72]], i32 8
-; OPTSIZE-NEXT:    br i1 [[TMP40]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]]
-; OPTSIZE:       [[PRED_STORE_IF21]]:
-; OPTSIZE-NEXT:    [[TMP41:%.*]] = add i64 [[INDEX]], 8
-; OPTSIZE-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]]
-; OPTSIZE-NEXT:    [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8
-; OPTSIZE-NEXT:    store i8 [[TMP43]], ptr [[TMP42]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE22]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE22]]:
-; OPTSIZE-NEXT:    [[TMP44:%.*]] = extractelement <16 x i1> [[TMP72]], i32 9
-; OPTSIZE-NEXT:    br i1 [[TMP44]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]]
-; OPTSIZE:       [[PRED_STORE_IF23]]:
-; OPTSIZE-NEXT:    [[TMP45:%.*]] = add i64 [[INDEX]], 9
-; OPTSIZE-NEXT:    [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]]
-; OPTSIZE-NEXT:    [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9
-; OPTSIZE-NEXT:    store i8 [[TMP47]], ptr [[TMP46]], align 1
-; OPTSIZE-NEXT:    br label %[[PRED_STORE_CONTINUE24]]
-; OPTSIZE:       [[PRED_STORE_CONTINUE24]]:
-; OPTSIZE-NEXT:    ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/129265


More information about the llvm-commits mailing list