[llvm] 7fe41ac - Revert "[LV] Unconditionally branch from middle to scalar preheader if the scalar loop must execute"

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 5 09:05:42 PST 2021


Adrian,

I'm going to need you to provide a bit more information here. The test 
failure in stage1 had already been fixed by the time you reverted this 
patch.  The remaining failure on the bot is very unclear.  What is an 
execution_time failure? From the log output, the "failing" run finished 
in 0.5 seconds, whereas the previous "succeeding" run finished in 11 
seconds.  Without further context, I'd say that's not a failure.

I'll also note that I did not receive email from this bot.  I received 
notices from the various other bots and fixed the ARM test issue, but 
unless I missed it among the others, this bot is not sending notifications.

In general, I'm a fan of fast reverts, but I have to admit, this one 
appears borderline at the moment.

Philip

On 2/5/21 3:53 AM, Adrian Kuegel via llvm-commits wrote:
> Author: Adrian Kuegel
> Date: 2021-02-05T12:51:03+01:00
> New Revision: 7fe41ac3dff2d44c3d2c31b28554fbe4a86eaa6c
>
> URL: https://github.com/llvm/llvm-project/commit/7fe41ac3dff2d44c3d2c31b28554fbe4a86eaa6c
> DIFF: https://github.com/llvm/llvm-project/commit/7fe41ac3dff2d44c3d2c31b28554fbe4a86eaa6c.diff
>
> LOG: Revert "[LV] Unconditionally branch from middle to scalar preheader if the scalar loop must execute"
>
> This reverts commit 3e5ce49e5371ce4feadbf97dd5c2b652d9db3d1d.
>
> Tests started failing on PPC, for example:
> http://lab.llvm.org:8011/#/builders/105/builds/5569
>
> Added:
>      
>
> Modified:
>      llvm/lib/Transforms/Utils/LoopVersioning.cpp
>      llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
>      llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
>      llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
>      llvm/test/Transforms/LoopVectorize/loop-form.ll
>
> Removed:
>      
>
>
> ################################################################################
> diff  --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
> index 8a89158788cf..de4fb446fdf2 100644
> --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
> +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
> @@ -44,11 +44,11 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
>         AliasChecks(Checks.begin(), Checks.end()),
>         Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
>         SE(SE) {
> +  assert(L->getUniqueExitBlock() && "No single exit block");
>   }
>   
>   void LoopVersioning::versionLoop(
>       const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
> -  assert(VersionedLoop->getUniqueExitBlock() && "No single exit block");
>     assert(VersionedLoop->isLoopSimplifyForm() &&
>            "Loop is not in loop-simplify form");
>   
>
> diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
> index 3277842edbfe..6bce0caeb36f 100644
> --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
> +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
> @@ -852,7 +852,7 @@ class InnerLoopVectorizer {
>     /// Middle Block between the vector and the scalar.
>     BasicBlock *LoopMiddleBlock;
>   
> -  /// The unique ExitBlock of the scalar loop if one exists.  Note that
> +  /// The (unique) ExitBlock of the scalar loop.  Note that
>     /// there can be multiple exiting edges reaching this block.
>     BasicBlock *LoopExitBlock;
>   
> @@ -3147,13 +3147,9 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
>                                  DT->getNode(Bypass)->getIDom()) &&
>            "TC check is expected to dominate Bypass");
>   
> -  // Update dominator for Bypass & LoopExit (if needed).
> +  // Update dominator for Bypass & LoopExit.
>     DT->changeImmediateDominator(Bypass, TCCheckBlock);
> -  if (!Cost->requiresScalarEpilogue())
> -    // If there is an epilogue which must run, there's no edge from the
> -    // middle block to exit blocks  and thus no need to update the immediate
> -    // dominator of the exit blocks.
> -    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
> +  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
>   
>     ReplaceInstWithInst(
>         TCCheckBlock->getTerminator(),
> @@ -3192,11 +3188,7 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
>     // Update dominator only if this is first RT check.
>     if (LoopBypassBlocks.empty()) {
>       DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
> -    if (!Cost->requiresScalarEpilogue())
> -      // If there is an epilogue which must run, there's no edge from the
> -      // middle block to exit blocks  and thus no need to update the immediate
> -      // dominator of the exit blocks.
> -      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
> +    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
>     }
>   
>     ReplaceInstWithInst(
> @@ -3252,11 +3244,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
>     // Update dominator only if this is first RT check.
>     if (LoopBypassBlocks.empty()) {
>       DT->changeImmediateDominator(Bypass, MemCheckBlock);
> -    if (!Cost->requiresScalarEpilogue())
> -      // If there is an epilogue which must run, there's no edge from the
> -      // middle block to exit blocks  and thus no need to update the immediate
> -      // dominator of the exit blocks.
> -      DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
> +    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
>     }
>   
>     Instruction *FirstCheckInst;
> @@ -3381,10 +3369,9 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
>   Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
>     LoopScalarBody = OrigLoop->getHeader();
>     LoopVectorPreHeader = OrigLoop->getLoopPreheader();
> +  LoopExitBlock = OrigLoop->getUniqueExitBlock();
> +  assert(LoopExitBlock && "Must have an exit block");
>     assert(LoopVectorPreHeader && "Invalid loop structure");
> -  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
> -  assert((LoopExitBlock || Cost->requiresScalarEpilogue()) &&
> -         "multiple exit loop without required epilogue?");
>   
>     LoopMiddleBlock =
>         SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
> @@ -3393,20 +3380,12 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
>         SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
>                    nullptr, Twine(Prefix) + "scalar.ph");
>   
> +  // Set up branch from middle block to the exit and scalar preheader blocks.
> +  // completeLoopSkeleton will update the condition to use an iteration check,
> +  // if required to decide whether to execute the remainder.
> +  BranchInst *BrInst =
> +      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
>     auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
> -
> -  // Set up the middle block terminator.  Two cases:
> -  // 1) If we know that we must execute the scalar epilogue, emit an
> -  //    unconditional branch.
> -  // 2) Otherwise, we must have a single unique exit block (due to how we
> -  //    implement the multiple exit case).  In this case, set up a conditonal
> -  //    branch from the middle block to the loop scalar preheader, and the
> -  //    exit block.  completeLoopSkeleton will update the condition to use an
> -  //    iteration check, if required to decide whether to execute the remainder.
> -  BranchInst *BrInst = Cost->requiresScalarEpilogue() ?
> -    BranchInst::Create(LoopScalarPreHeader) :
> -    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
> -                       Builder.getTrue());
>     BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
>     ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
>   
> @@ -3418,11 +3397,7 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
>                    nullptr, nullptr, Twine(Prefix) + "vector.body");
>   
>     // Update dominator for loop exit.
> -  if (!Cost->requiresScalarEpilogue())
> -    // If there is an epilogue which must run, there's no edge from the
> -    // middle block to exit blocks  and thus no need to update the immediate
> -    // dominator of the exit blocks.
> -    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
> +  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
>   
>     // Create and register the new vector loop.
>     Loop *Lp = LI->AllocateLoop();
> @@ -3519,14 +3494,10 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
>     auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
>   
>     // Add a check in the middle block to see if we have completed
> -  // all of the iterations in the first vector loop.  Three cases:
> -  // 1) If we require a scalar epilogue, there is no conditional branch as
> -  //    we unconditionally branch to the scalar preheader.  Do nothing.
> -  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
> -  //    Thus if tail is to be folded, we know we don't need to run the
> -  //    remainder and we can use the previous value for the condition (true).
> -  // 3) Otherwise, construct a runtime check.
> -  if (!Cost->requiresScalarEpilogue() && !Cost->foldTailByMasking()) {
> +  // all of the iterations in the first vector loop.
> +  // If (N - N%VF) == N, then we *don't* need to run the remainder.
> +  // If tail is to be folded, we know we don't need to run the remainder.
> +  if (!Cost->foldTailByMasking()) {
>       Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
>                                           Count, VectorTripCount, "cmp.n",
>                                           LoopMiddleBlock->getTerminator());
> @@ -3590,17 +3561,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
>     |    [  ]_|   <-- vector loop.
>     |     |
>     |     v
> -  \   -[ ]   <--- middle-block.
> -   \/   |
> -   /\   v
> -   | ->[ ]     <--- new preheader.
> +  |   -[ ]   <--- middle-block.
> +  |  /  |
> +  | /   v
> +  -|- >[ ]     <--- new preheader.
>      |    |
> - (opt)  v      <-- edge from middle to exit iff epilogue is not required.
> +   |    v
>      |   [ ] \
> -   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
> +   |   [ ]_|   <-- old scalar loop to handle remainder.
>       \   |
>        \  v
> -      >[ ]     <-- exit block(s).
> +      >[ ]     <-- exit block.
>      ...
>      */
>   
> @@ -4021,18 +3992,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
>     // Forget the original basic block.
>     PSE.getSE()->forgetLoop(OrigLoop);
>   
> -  // If we inserted an edge from the middle block to the unique exit block,
> -  // update uses outside the loop (phis) to account for the newly inserted
> -  // edge.
> -  if (!Cost->requiresScalarEpilogue()) {
> -    // Fix-up external users of the induction variables.
> -    for (auto &Entry : Legal->getInductionVars())
> -      fixupIVUsers(Entry.first, Entry.second,
> -                   getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
> -                   IVEndValues[Entry.first], LoopMiddleBlock);
> +  // Fix-up external users of the induction variables.
> +  for (auto &Entry : Legal->getInductionVars())
> +    fixupIVUsers(Entry.first, Entry.second,
> +                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
> +                 IVEndValues[Entry.first], LoopMiddleBlock);
>   
> -    fixLCSSAPHIs();
> -  }
> +  fixLCSSAPHIs();
>     for (Instruction *PI : PredicatedInstructions)
>       sinkScalarOperands(&*PI);
>   
> @@ -4250,13 +4216,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
>     // recurrence in the exit block, and then add an edge for the middle block.
>     // Note that LCSSA does not imply single entry when the original scalar loop
>     // had multiple exiting edges (as we always run the last iteration in the
> -  // scalar epilogue); in that case, there is no edge from middle to exit and
> -  // and thus no phis which needed updated.
> -  if (!Cost->requiresScalarEpilogue())
> -    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
> -      if (any_of(LCSSAPhi.incoming_values(),
> -                 [Phi](Value *V) { return V == Phi; }))
> -        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
> +  // scalar epilogue); in that case, the exiting path through middle will be
> +  // dynamically dead and the value picked for the phi doesn't matter.
> +  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
> +    if (any_of(LCSSAPhi.incoming_values(),
> +               [Phi](Value *V) { return V == Phi; }))
> +      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
>   }
>   
>   void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
> @@ -4421,11 +4386,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
>     // We know that the loop is in LCSSA form. We need to update the PHI nodes
>     // in the exit blocks.  See comment on analogous loop in
>     // fixFirstOrderRecurrence for a more complete explaination of the logic.
> -  if (!Cost->requiresScalarEpilogue())
> -    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
> -      if (any_of(LCSSAPhi.incoming_values(),
> -                 [LoopExitInst](Value *V) { return V == LoopExitInst; }))
> -        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
> +  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
> +    if (any_of(LCSSAPhi.incoming_values(),
> +               [LoopExitInst](Value *V) { return V == LoopExitInst; }))
> +      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
>   
>     // Fix the scalar loop reduction variable with the incoming reduction sum
>     // from the vector body and from the backedge value.
> @@ -8074,11 +8038,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
>   
>       // Update dominator for Bypass & LoopExit.
>       DT->changeImmediateDominator(Bypass, TCCheckBlock);
> -    if (!Cost->requiresScalarEpilogue())
> -      // For loops with multiple exits, there's no edge from the middle block
> -      // to exit blocks (as the epilogue must run) and thus no need to update
> -      // the immediate dominator of the exit blocks.
> -      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
> +    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
>   
>       LoopBypassBlocks.push_back(TCCheckBlock);
>   
> @@ -8142,12 +8102,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
>   
>     DT->changeImmediateDominator(LoopScalarPreHeader,
>                                  EPI.EpilogueIterationCountCheck);
> -  if (!Cost->requiresScalarEpilogue())
> -    // If there is an epilogue which must run, there's no edge from the
> -    // middle block to exit blocks  and thus no need to update the immediate
> -    // dominator of the exit blocks.
> -    DT->changeImmediateDominator(LoopExitBlock,
> -                                 EPI.EpilogueIterationCountCheck);
> +  DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
>   
>     // Keep track of bypass blocks, as they feed start values to the induction
>     // phis in the scalar loop preheader.
>
> diff  --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
> index ec280bf5d5e4..7d4a3c5c9935 100644
> --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
> +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll
> @@ -471,9 +471,10 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
>   ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
>   ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
> @@ -485,14 +486,14 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
>   ; CHECK-NEXT:    [[REC_NEXT]] = load i16, i16* [[B]], align 2
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
>   ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
>   ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
>   ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP7:!llvm.loop !.*]]
>   ; CHECK:       if.end:
> -; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ]
> +; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    ret i16 [[REC_LCSSA]]
>   ;
>   entry:
> @@ -557,9 +558,10 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
>   ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
>   ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
> @@ -571,14 +573,14 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
>   ; CHECK-NEXT:    [[REC_NEXT]] = load i16, i16* [[B]], align 2
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
>   ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I]], 1
>   ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
>   ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]]
>   ; CHECK:       if.end:
> -; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ]
> +; CHECK-NEXT:    [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    ret i16 [[REC_LCSSA]]
>   ;
>   entry:
>
> diff  --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
> index f0ba677348ab..0d4bdf0ecac3 100644
> --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
> +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
> @@ -447,7 +447,7 @@ define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalia
>   ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 508
>   ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1016, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
> @@ -463,7 +463,7 @@ define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalia
>   ; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
>   ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], [[LOOP13:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP13:!llvm.loop !.*]]
>   ;
>   entry:
>     br label %for.body
> @@ -528,7 +528,7 @@ define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noali
>   ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
> @@ -544,7 +544,7 @@ define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noali
>   ; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
>   ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], [[LOOP15:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], [[LOOP15:!llvm.loop !.*]]
>   ;
>   entry:
>     br label %for.body
> @@ -973,7 +973,7 @@ define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
>   ; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
> @@ -985,7 +985,7 @@ define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
>   ; CHECK-NEXT:    store i32 [[Z]], i32* [[P_I_Y]], align 4
>   ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
>   ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
> -; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], [[LOOP25:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP25:!llvm.loop !.*]]
>   ; CHECK:       for.end:
>   ; CHECK-NEXT:    ret void
>   ;
> @@ -1066,7 +1066,7 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
>   ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
>   ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
>   ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
> @@ -1081,9 +1081,10 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
>   ; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP20]], [[S]]
>   ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
>   ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
> -; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], [[LOOP27:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP27:!llvm.loop !.*]]
>   ; CHECK:       for.end:
> -; CHECK-NEXT:    ret i32 [[TMP21]]
> +; CHECK-NEXT:    [[TMP22:%.*]] = phi i32 [ [[TMP21]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
> +; CHECK-NEXT:    ret i32 [[TMP22]]
>   ;
>   entry:
>     br label %for.body
> @@ -1162,7 +1163,7 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
>   ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP28:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
> @@ -1176,7 +1177,7 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
>   ; CHECK-NEXT:    store i32 [[TMP21]], i32* [[P_I_Y]], align 4
>   ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
>   ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
> -; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], [[LOOP29:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP29:!llvm.loop !.*]]
>   ; CHECK:       for.end:
>   ; CHECK-NEXT:    ret void
>   ;
> @@ -1263,7 +1264,7 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
>   ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
>   ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]]
>   ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
> @@ -1281,9 +1282,10 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
>   ; CHECK-NEXT:    [[TMP25]] = add nsw i32 [[TMP24]], [[S]]
>   ; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
>   ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
> -; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], [[LOOP31:!llvm.loop !.*]]
> +; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP31:!llvm.loop !.*]]
>   ; CHECK:       for.end:
> -; CHECK-NEXT:    ret i32 [[TMP25]]
> +; CHECK-NEXT:    [[TMP26:%.*]] = phi i32 [ [[TMP25]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
> +; CHECK-NEXT:    ret i32 [[TMP26]]
>   ;
>   entry:
>     br label %for.body
>
> diff  --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
> index f32002fae2b6..91780789088b 100644
> --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll
> +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
> @@ -146,14 +146,15 @@ define void @early_exit(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
>   ; CHECK:       for.cond:
>   ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
> @@ -285,14 +286,15 @@ define void @multiple_unique_exit(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
>   ; CHECK:       for.cond:
>   ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
> @@ -372,14 +374,17 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
> +; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = sub i32 [[N_VEC]], 1
> +; CHECK-NEXT:    [[IND_ESCAPE1:%.*]] = sub i32 [[N_VEC]], 1
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
>   ; CHECK:       for.cond:
>   ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
> @@ -388,7 +393,7 @@ define i32 @multiple_unique_exit2(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
>   ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]]
>   ; CHECK:       if.end:
> -; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_BODY]] ], [ [[I]], [[FOR_COND]] ]
> +; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[FOR_BODY]] ], [ [[I]], [[FOR_COND]] ], [ [[IND_ESCAPE1]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    ret i32 [[I_LCSSA]]
>   ;
>   ; TAILFOLD-LABEL: @multiple_unique_exit2(
> @@ -461,14 +466,15 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
>   ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
>   ; CHECK:       for.cond:
>   ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
> -; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
> +; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    [[IPROM:%.*]] = sext i32 [[I]] to i64
>   ; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
> @@ -477,7 +483,7 @@ define i32 @multiple_unique_exit3(i16* %p, i32 %n) {
>   ; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
>   ; CHECK-NEXT:    br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP11:!llvm.loop !.*]]
>   ; CHECK:       if.end:
> -; CHECK-NEXT:    [[EXIT:%.*]] = phi i32 [ 0, [[FOR_COND]] ], [ 1, [[FOR_BODY]] ]
> +; CHECK-NEXT:    [[EXIT:%.*]] = phi i32 [ 0, [[FOR_COND]] ], [ 1, [[FOR_BODY]] ], [ 0, [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    ret i32 [[EXIT]]
>   ;
>   ; TAILFOLD-LABEL: @multiple_unique_exit3(
> @@ -994,7 +1000,8 @@ define void @scalar_predication(float* %addr) {
>   ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
>   ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
>   ; CHECK:       middle.block:
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 201, 200
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
> @@ -1002,7 +1009,7 @@ define void @scalar_predication(float* %addr) {
>   ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
>   ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, float* [[ADDR]], i64 [[IV]]
>   ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
> -; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]]
> +; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY:%.*]]
>   ; CHECK:       loop.body:
>   ; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[GEP]], align 4
>   ; CHECK-NEXT:    [[PRED:%.*]] = fcmp oeq float [[TMP11]], 0.000000e+00
> @@ -1088,7 +1095,8 @@ define i32 @me_reduction(i32* %addr) {
>   ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
>   ; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[TMP5]], [[RDX_SHUF]]
>   ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0
> -; CHECK-NEXT:    br label [[SCALAR_PH]]
> +; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 201, 200
> +; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
>   ; CHECK:       scalar.ph:
>   ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
>   ; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
> @@ -1098,7 +1106,7 @@ define i32 @me_reduction(i32* %addr) {
>   ; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP_LATCH]] ]
>   ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[ADDR]], i64 [[IV]]
>   ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
> -; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
> +; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_LATCH]]
>   ; CHECK:       loop.latch:
>   ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[GEP]], align 4
>   ; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP8]]
> @@ -1106,7 +1114,7 @@ define i32 @me_reduction(i32* %addr) {
>   ; CHECK-NEXT:    [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400
>   ; CHECK-NEXT:    br i1 [[EXITCOND2_NOT]], label [[EXIT]], label [[LOOP_HEADER]], [[LOOP15:!llvm.loop !.*]]
>   ; CHECK:       exit:
> -; CHECK-NEXT:    [[LCSSA:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[ACCUM_NEXT]], [[LOOP_LATCH]] ]
> +; CHECK-NEXT:    [[LCSSA:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[ACCUM_NEXT]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
>   ; CHECK-NEXT:    ret i32 [[LCSSA]]
>   ;
>   ; TAILFOLD-LABEL: @me_reduction(
>
>
>          
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits

