[llvm] 1742203 - [SimplifyCFG] FoldBranchToCommonDest(): re-lift restrictions on liveout uses of bonus instructions

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 22 14:29:43 PST 2021


Author: Roman Lebedev
Date: 2021-01-23T01:29:05+03:00
New Revision: 17422038442c9e2b572c7324b5a22d32e7fd9b83

URL: https://github.com/llvm/llvm-project/commit/17422038442c9e2b572c7324b5a22d32e7fd9b83
DIFF: https://github.com/llvm/llvm-project/commit/17422038442c9e2b572c7324b5a22d32e7fd9b83.diff

LOG: [SimplifyCFG] FoldBranchToCommonDest(): re-lift restrictions on liveout uses of bonus instructions

I have previously tried doing that in
b33fbbaa34f0fe9fb16789afc72ae424c1825b69 / d38205144febf4dc42c9270c6aa3d978f1ef65e1,
but eventually it was pointed out that the approach taken there
was just broken wrt how the uses of bonus instructions are updated
to account for the fact that they should now use either bonus instruction
or the cloned bonus instruction. In particluar, all that manual handling
of PHI nodes in successors was just wrong.

But, the fix is actually much much simpler than my initial approach:
just tell SSAUpdate about both instances of bonus instruction,
and let it deal with all the PHI handling.

Alive2 confirms that the reproducers from the original bugs (@pr48450*)
are now handled correctly.

This effectively reverts commit 59560e85897afc50090b6c3d920bacfd28b49d06,
effectively relanding b33fbbaa34f0fe9fb16789afc72ae424c1825b69.

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/SimplifyCFG.cpp
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
    llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll
    llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 510387bbf9ef..bad8d90cda65 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2916,39 +2916,16 @@ static bool PerformBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
     PredBlock->getInstList().insert(PBI->getIterator(), NewBonusInst);
     NewBonusInst->takeName(&BonusInst);
     BonusInst.setName(NewBonusInst->getName() + ".old");
-    BonusInst.replaceUsesWithIf(
-        NewBonusInst, [BB, BI, UniqueSucc, PredBlock](Use &U) {
-          auto *User = cast<Instruction>(U.getUser());
-          // Ignore non-external uses of bonus instructions.
-          if (User->getParent() == BB) {
-            assert(!isa<PHINode>(User) &&
-                   "Non-external users are never PHI instructions.");
-            return false;
-          }
-          if (User->getParent() == PredBlock) {
-            // The "exteral" use is in the block into which we just cloned the
-            // bonus instruction. This means two things: 1. we are in an
-            // unreachable block 2. the instruction is self-referencing.
-            // So let's just rewrite it...
-            return true;
-          }
-          (void)BI;
-          assert(isa<PHINode>(User) && "All external users must be PHI's.");
-          auto *PN = cast<PHINode>(User);
-          assert(is_contained(successors(BB), User->getParent()) &&
-                 "All external users must be in successors of BB.");
-          assert((PN->getIncomingBlock(U) == BB ||
-                  PN->getIncomingBlock(U) == PredBlock) &&
-                 "The incoming block for that incoming value external use "
-                 "must be either the original block with bonus instructions, "
-                 "or the new predecessor block.");
-          // UniqueSucc is the block for which we change it's predecessors,
-          // so it is the only block in which we'll need to update PHI nodes.
-          if (User->getParent() != UniqueSucc)
-            return false;
-          // Update the incoming value for the new predecessor.
-          return PN->getIncomingBlock(U) == PredBlock;
-        });
+
+    // Update (liveout) uses of bonus instructions,
+    // now that the bonus instruction has been cloned into predecessor.
+    SSAUpdater SSAUpdate;
+    SSAUpdate.Initialize(BonusInst.getType(),
+                         (NewBonusInst->getName() + ".merge").str());
+    SSAUpdate.AddAvailableValue(BB, &BonusInst);
+    SSAUpdate.AddAvailableValue(PredBlock, NewBonusInst);
+    for (Use &U : make_early_inc_range(BonusInst.uses()))
+      SSAUpdate.RewriteUseAfterInsertions(U);
   }
 
   // Now that the Cond was cloned into the predecessor basic block,
@@ -3025,22 +3002,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
       return Changed;
   }
 
-  // Also, for now, all liveout uses of bonus instructions must be in PHI nodes
-  // in successor blocks as incoming values from the bonus instructions's block,
-  // otherwise we'll fail to update them.
-  // FIXME: We could lift this restriction, but we need to form PHI nodes and
-  // rewrite offending uses, but we can't do that without having a domtree.
-  if (any_of(*BB, [BB](Instruction &I) {
-        return any_of(I.uses(), [BB](Use &U) {
-          auto *User = cast<Instruction>(U.getUser());
-          if (User->getParent() == BB)
-            return false; // Not an external use.
-          auto *PN = dyn_cast<PHINode>(User);
-          return !PN || PN->getIncomingBlock(U) != BB;
-        });
-      }))
-    return Changed;
-
   // Cond is known to be a compare or binary operator.  Check to make sure that
   // neither operand is a potentially-trapping constant expression.
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 5bf4ebf92f14..fa58c72a0efb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1077,36 +1077,34 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r3, #8
 ; CHECK-NEXT:    blo.w .LBB16_12
-; CHECK-NEXT:  @ %bb.1: @ %if.then
-; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    cmp.w r7, r3, lsr #2
+; CHECK-NEXT:  @ %bb.1: @ %entry
+; CHECK-NEXT:    lsrs.w r12, r3, #2
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
 ; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    lsr.w r9, r3, #2
-; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    ldrd r5, r3, [r0, #4]
 ; CHECK-NEXT:    sub.w r0, r4, #8
 ; CHECK-NEXT:    and r8, r0, #7
 ; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
-; CHECK-NEXT:    asrs r6, r7, #3
-; CHECK-NEXT:    cmp r6, #1
+; CHECK-NEXT:    asr.w lr, r7, #3
+; CHECK-NEXT:    cmp.w lr, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r3, r7, #3
+; CHECK-NEXT:    asrgt r6, r7, #3
 ; CHECK-NEXT:    add.w r7, r5, r4, lsl #1
-; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT:    subs r3, r7, #2
-; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    rsbs r3, r4, #0
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r12, #16
+; CHECK-NEXT:    subs r7, #2
+; CHECK-NEXT:    str r7, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    rsbs r7, r4, #0
+; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    add.w r7, r3, #16
+; CHECK-NEXT:    str r6, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r9, r9, #1
+; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
@@ -1117,16 +1115,16 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #8
-; CHECK-NEXT:    ldrh.w lr, [r12, #14]
-; CHECK-NEXT:    ldrh.w r0, [r12, #12]
+; CHECK-NEXT:    ldrh.w lr, [r3, #14]
+; CHECK-NEXT:    ldrh r0, [r3, #12]
 ; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    ldrh.w r4, [r12, #10]
-; CHECK-NEXT:    ldrh.w r7, [r12, #8]
-; CHECK-NEXT:    ldrh.w r3, [r12, #6]
-; CHECK-NEXT:    ldrh.w r6, [r12, #4]
-; CHECK-NEXT:    ldrh.w r11, [r12, #2]
-; CHECK-NEXT:    ldrh.w r10, [r12]
+; CHECK-NEXT:    ldrh r4, [r3, #10]
+; CHECK-NEXT:    ldrh r7, [r3, #8]
+; CHECK-NEXT:    ldrh r6, [r3, #6]
+; CHECK-NEXT:    ldrh.w r9, [r3, #4]
+; CHECK-NEXT:    ldrh.w r11, [r3, #2]
+; CHECK-NEXT:    ldrh.w r10, [r3]
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
 ; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
@@ -1136,10 +1134,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    adds r1, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r11
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
-; CHECK-NEXT:    vfma.f16 q0, q1, r6
+; CHECK-NEXT:    vfma.f16 q0, q1, r9
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    add.w r1, r5, #10
-; CHECK-NEXT:    vfma.f16 q0, q1, r3
+; CHECK-NEXT:    vfma.f16 q0, q1, r6
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r7
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 3986b53cab21..3d0309d1579f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1049,36 +1049,34 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    cmp r3, #8
 ; CHECK-NEXT:    blo.w .LBB16_12
-; CHECK-NEXT:  @ %bb.1: @ %if.then
-; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    cmp.w r7, r3, lsr #2
+; CHECK-NEXT:  @ %bb.1: @ %entry
+; CHECK-NEXT:    lsrs.w r12, r3, #2
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
-; CHECK-NEXT:    ldrh r4, [r0]
-; CHECK-NEXT:    lsr.w r10, r3, #2
-; CHECK-NEXT:    ldrd r5, r12, [r0, #4]
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    sub.w r7, r4, #8
-; CHECK-NEXT:    add.w r0, r7, r7, lsr #29
-; CHECK-NEXT:    asrs r6, r0, #3
-; CHECK-NEXT:    cmp r6, #1
+; CHECK-NEXT:    ldrh r6, [r0]
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    ldrd r5, r10, [r0, #4]
+; CHECK-NEXT:    sub.w r3, r6, #8
+; CHECK-NEXT:    add.w r0, r3, r3, lsr #29
+; CHECK-NEXT:    asrs r7, r0, #3
+; CHECK-NEXT:    cmp r7, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r3, r0, #3
-; CHECK-NEXT:    add.w r0, r5, r4, lsl #2
+; CHECK-NEXT:    asrgt r4, r0, #3
+; CHECK-NEXT:    add.w r0, r5, r6, lsl #2
 ; CHECK-NEXT:    sub.w r9, r0, #4
-; CHECK-NEXT:    rsbs r0, r4, #0
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    and r3, r7, #7
+; CHECK-NEXT:    rsbs r0, r6, #0
+; CHECK-NEXT:    str r4, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    and r4, r3, #7
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r12, #32
-; CHECK-NEXT:    str r4, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r10, #32
+; CHECK-NEXT:    str r6, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_3: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    subs.w r10, r10, #1
+; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #2
 ; CHECK-NEXT:    add.w r5, r0, #16
@@ -1087,25 +1085,25 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    ldrd r7, r6, [r12]
-; CHECK-NEXT:    ldrd r0, r4, [r12, #8]
-; CHECK-NEXT:    ldrd r3, lr, [r12, #16]
-; CHECK-NEXT:    ldrd r11, r8, [r12, #24]
+; CHECK-NEXT:    ldrd r3, r7, [r10]
+; CHECK-NEXT:    ldm.w lr, {r0, r4, r6, lr}
+; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r5], #32
 ; CHECK-NEXT:    strd r9, r1, [sp, #24] @ 8-byte Folded Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #-28]
-; CHECK-NEXT:    vmul.f32 q0, q0, r7
+; CHECK-NEXT:    vmul.f32 q0, q0, r3
 ; CHECK-NEXT:    vldrw.u32 q6, [r5, #-24]
 ; CHECK-NEXT:    vldrw.u32 q4, [r5, #-20]
-; CHECK-NEXT:    vfma.f32 q0, q1, r6
+; CHECK-NEXT:    vfma.f32 q0, q1, r7
 ; CHECK-NEXT:    vldrw.u32 q5, [r5, #-16]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r0
 ; CHECK-NEXT:    vldrw.u32 q2, [r5, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r4
 ; CHECK-NEXT:    vldrw.u32 q3, [r5, #-8]
-; CHECK-NEXT:    vfma.f32 q0, q5, r3
+; CHECK-NEXT:    vfma.f32 q0, q5, r6
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f32 q0, q2, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #-4]
@@ -1149,26 +1147,26 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:  .LBB16_8: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
 ; CHECK-NEXT:    ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload
-; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    cmp.w r3, #0
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    cmp.w r4, #0
 ; CHECK-NEXT:    beq .LBB16_3
 ; CHECK-NEXT:    b .LBB16_9
 ; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    mov r6, r5
-; CHECK-NEXT:    mov lr, r3
+; CHECK-NEXT:    mov r3, r5
+; CHECK-NEXT:    mov lr, r4
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
-; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT:    subs.w lr, lr, #1
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    bne .LBB16_10
 ; CHECK-NEXT:    b .LBB16_11
 ; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    add.w r5, r5, r3, lsl #2
+; CHECK-NEXT:    add.w r5, r5, r4, lsl #2
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #32

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index 3503649976f9..06a6edb2566e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -213,14 +213,14 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    strd r0, r1, [sp, #12] @ 8-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB2_8
-; CHECK-NEXT:  @ %bb.1: @ %for.cond.preheader
+; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
 ; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r11, [sp, #56]
+; CHECK-NEXT:    ldr r4, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -229,57 +229,57 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r3, #7
-; CHECK-NEXT:    lsrs r0, r0, #3
+; CHECK-NEXT:    lsr.w r11, r0, #3
 ; CHECK-NEXT:    b .LBB2_5
 ; CHECK-NEXT:  .LBB2_3: @ in Loop: Header=BB2_5 Depth=1
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB2_4: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #72]
-; CHECK-NEXT:    add.w r1, r8, r10
-; CHECK-NEXT:    add r1, r6
-; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    ldr r1, [sp, #72]
+; CHECK-NEXT:    add.w r0, r8, r12
+; CHECK-NEXT:    add r0, r6
+; CHECK-NEXT:    add r0, r10
+; CHECK-NEXT:    strb.w r0, [r1, r9]
 ; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    beq .LBB2_8
 ; CHECK-NEXT:  .LBB2_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_7 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
-; CHECK-NEXT:    subs r1, r0, r0
+; CHECK-NEXT:    ldr r0, [sp, #68]
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
+; CHECK-NEXT:    subs.w r0, r11, r11
 ; CHECK-NEXT:    ble .LBB2_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
-; CHECK-NEXT:    ldr r7, [sp, #64]
-; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r3, [sp, #64]
+; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    ldrd r5, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mla r7, r9, r3, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r7, r9, r7, r3
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r12, r10
 ; CHECK-NEXT:  .LBB2_7: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB2_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r11
+; CHECK-NEXT:    vldrb.s16 q0, [r5], #8
+; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
-; CHECK-NEXT:    vmlava.s16 r12, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
-; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    le lr, .LBB2_7
 ; CHECK-NEXT:    b .LBB2_4
 ; CHECK-NEXT:  .LBB2_8: @ %if.end
@@ -396,14 +396,14 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    strd r0, r1, [sp, #12] @ 8-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB3_8
-; CHECK-NEXT:  @ %bb.1: @ %for.cond.preheader
+; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:    cmp r2, #0
 ; CHECK-NEXT:    beq .LBB3_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r3, [sp, #64]
 ; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    ldr.w r11, [sp, #56]
+; CHECK-NEXT:    ldr r4, [sp, #56]
 ; CHECK-NEXT:    add.w r0, r1, r3, lsl #1
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r1, r3
@@ -412,55 +412,55 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r3, #7
-; CHECK-NEXT:    lsrs r0, r0, #3
+; CHECK-NEXT:    lsr.w r11, r0, #3
 ; CHECK-NEXT:  .LBB3_3: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_5 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #68]
-; CHECK-NEXT:    ldr.w r12, [r1, r9, lsl #2]
-; CHECK-NEXT:    subs r1, r0, r0
+; CHECK-NEXT:    ldr r0, [sp, #68]
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
+; CHECK-NEXT:    subs.w r0, r11, r11
 ; CHECK-NEXT:    ble .LBB3_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    ldr r7, [sp, #64]
-; CHECK-NEXT:    mov r6, r12
-; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r3, [sp, #64]
+; CHECK-NEXT:    mov r6, r10
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r0
+; CHECK-NEXT:    ldrd r5, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mla r7, r9, r3, r1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r7, r9, r7, r3
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r4, r3, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r10, r12
+; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r12, r10
 ; CHECK-NEXT:  .LBB3_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB3_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r4], #8
-; CHECK-NEXT:    vadd.i16 q1, q0, r11
+; CHECK-NEXT:    vldrb.s16 q0, [r5], #8
+; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
-; CHECK-NEXT:    vmlava.s16 r12, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
-; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r3], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r6, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
-; CHECK-NEXT:    vadd.i16 q1, q1, r11
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vadd.i16 q1, q1, r4
+; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:    b .LBB3_7
 ; CHECK-NEXT:  .LBB3_6: @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB3_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB3_3 Depth=1
-; CHECK-NEXT:    ldr r3, [sp, #72]
-; CHECK-NEXT:    add.w r1, r8, r10
-; CHECK-NEXT:    add r1, r6
-; CHECK-NEXT:    add r1, r12
-; CHECK-NEXT:    strb.w r1, [r3, r9]
+; CHECK-NEXT:    ldr r1, [sp, #72]
+; CHECK-NEXT:    add.w r0, r8, r12
+; CHECK-NEXT:    add r0, r6
+; CHECK-NEXT:    add r0, r10
+; CHECK-NEXT:    strb.w r0, [r1, r9]
 ; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    cmp r9, r2
 ; CHECK-NEXT:    bne .LBB3_3
@@ -689,13 +689,13 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB5_8
-; CHECK-NEXT:  @ %bb.1: @ %for.cond.preheader
+; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r2, [sp, #92]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r4, [sp, #76]
 ; CHECK-NEXT:    add.w r0, r1, r2, lsl #1
@@ -709,20 +709,20 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    lsrs r1, r0, #3
 ; CHECK-NEXT:    b .LBB5_5
 ; CHECK-NEXT:  .LBB5_3: @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB5_4: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
-; CHECK-NEXT:    add.w r0, r8, r10
+; CHECK-NEXT:    add.w r0, r12, r8
 ; CHECK-NEXT:    ldr r1, [sp, #100]
 ; CHECK-NEXT:    add r0, r6
-; CHECK-NEXT:    add r0, r12
-; CHECK-NEXT:    strb.w r0, [r1, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
+; CHECK-NEXT:    add r0, r10
+; CHECK-NEXT:    strb.w r0, [r1, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    cmp r11, r0
+; CHECK-NEXT:    cmp r9, r0
 ; CHECK-NEXT:    beq .LBB5_8
 ; CHECK-NEXT:  .LBB5_5: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
@@ -730,36 +730,37 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16
 ; CHECK-NEXT:    ldr r0, [sp, #96]
 ; CHECK-NEXT:    cmp r1, r1
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldr.w r12, [r0, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
 ; CHECK-NEXT:    bge .LBB5_3
 ; CHECK-NEXT:  @ %bb.6: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB5_5 Depth=1
 ; CHECK-NEXT:    ldr r2, [sp, #92]
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r3, r11, r2, r0
-; CHECK-NEXT:    mov r9, r2
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mla r3, r9, r2, r0
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r11, r2
+; CHECK-NEXT:    mov r8, r10
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB5_7: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB5_5 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r0], #8
+; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r3], #8
-; CHECK-NEXT:    vmlava.s16 r12, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r7], #8
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB5_7
 ; CHECK-NEXT:    b .LBB5_4
 ; CHECK-NEXT:  .LBB5_8: @ %if.end
@@ -879,13 +880,13 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_
 ; CHECK-NEXT:    cmp r3, #4
 ; CHECK-NEXT:    stm.w r12, {r0, r1, r2} @ 12-byte Folded Spill
 ; CHECK-NEXT:    bne .LBB6_8
-; CHECK-NEXT:  @ %bb.1: @ %for.cond.preheader
+; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    beq .LBB6_8
 ; CHECK-NEXT:  @ %bb.2: @ %for.body.lr.ph
 ; CHECK-NEXT:    ldr r2, [sp, #92]
-; CHECK-NEXT:    mov.w r11, #0
+; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r4, [sp, #76]
 ; CHECK-NEXT:    add.w r0, r1, r2, lsl #1
@@ -903,53 +904,54 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_
 ; CHECK-NEXT:    ldr r0, [sp, #96]
 ; CHECK-NEXT:    cmp r1, r1
 ; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    ldr.w r12, [r0, r11, lsl #2]
+; CHECK-NEXT:    ldr.w r10, [r0, r9, lsl #2]
 ; CHECK-NEXT:    bge .LBB6_6
 ; CHECK-NEXT:  @ %bb.4: @ %for.body24.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
 ; CHECK-NEXT:    ldr r2, [sp, #92]
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mla r3, r11, r2, r0
-; CHECK-NEXT:    mov r9, r2
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    ldm.w sp, {r0, r5, r7} @ 12-byte Folded Reload
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mla r3, r9, r2, r0
+; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r7, r0, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    mov r11, r2
+; CHECK-NEXT:    mov r8, r10
 ; CHECK-NEXT:    dlstp.16 lr, r2
 ; CHECK-NEXT:  .LBB6_5: @ %for.body24
 ; CHECK-NEXT:    @ Parent Loop BB6_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.s16 q0, [r0], #8
+; CHECK-NEXT:    vldrb.s16 q0, [r7], #8
 ; CHECK-NEXT:    vadd.i16 q1, q0, r4
 ; CHECK-NEXT:    vldrb.s16 q0, [r3], #8
-; CHECK-NEXT:    vmlava.s16 r12, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r7], #8
+; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
 ; CHECK-NEXT:    vmlava.s16 r6, q0, q1
-; CHECK-NEXT:    vldrb.s16 q1, [r5], #8
+; CHECK-NEXT:    vldrb.s16 q1, [r0], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r8, q0, q1
+; CHECK-NEXT:    vmlava.s16 r12, q0, q1
 ; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
 ; CHECK-NEXT:    vadd.i16 q1, q1, r4
-; CHECK-NEXT:    vmlava.s16 r10, q0, q1
+; CHECK-NEXT:    vmlava.s16 r8, q0, q1
 ; CHECK-NEXT:    letp lr, .LBB6_5
 ; CHECK-NEXT:    b .LBB6_7
 ; CHECK-NEXT:  .LBB6_6: @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    mov r10, r12
-; CHECK-NEXT:    mov r8, r12
-; CHECK-NEXT:    mov r6, r12
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    mov r6, r10
 ; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup23
 ; CHECK-NEXT:    @ in Loop: Header=BB6_3 Depth=1
-; CHECK-NEXT:    add.w r0, r8, r10
+; CHECK-NEXT:    add.w r0, r12, r8
 ; CHECK-NEXT:    ldr r1, [sp, #100]
 ; CHECK-NEXT:    add r0, r6
-; CHECK-NEXT:    add r0, r12
-; CHECK-NEXT:    strb.w r0, [r1, r11]
-; CHECK-NEXT:    add.w r11, r11, #1
+; CHECK-NEXT:    add r0, r10
+; CHECK-NEXT:    strb.w r0, [r1, r9]
+; CHECK-NEXT:    add.w r9, r9, #1
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    cmp r11, r0
+; CHECK-NEXT:    cmp r9, r0
 ; CHECK-NEXT:    bne .LBB6_3
 ; CHECK-NEXT:  .LBB6_8: @ %if.end
 ; CHECK-NEXT:    ldr r0, [sp, #100]

diff  --git a/llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll b/llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll
index b294558f1d47..8d89053c96f1 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-loop-inner.ll
@@ -7,13 +7,10 @@ define void @basic(i32 %K, i32 %N) {
 ; CHECK-NEXT:    br label [[OUTER:%.*]]
 ; CHECK:       outer:
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
-; CHECK-NEXT:    [[CMP_INNER_PEEL:%.*]] = icmp sgt i32 [[K:%.*]], 1
-; CHECK-NEXT:    br i1 [[CMP_INNER_PEEL]], label [[INNER_PEEL2:%.*]], label [[OUTER_BACKEDGE]]
-; CHECK:       inner.peel2:
-; CHECK-NEXT:    [[CMP_INNER_PEEL8:%.*]] = icmp sgt i32 [[K]], 3
+; CHECK-NEXT:    [[CMP_INNER_PEEL8:%.*]] = icmp sgt i32 [[K:%.*]], 3
 ; CHECK-NEXT:    br i1 [[CMP_INNER_PEEL8]], label [[INNER:%.*]], label [[OUTER_BACKEDGE]]
 ; CHECK:       inner:
-; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[J_INC:%.*]], [[INNER]] ], [ 3, [[INNER_PEEL2]] ]
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[J_INC:%.*]], [[INNER]] ], [ 3, [[OUTER]] ]
 ; CHECK-NEXT:    [[J_INC]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT:    [[CMP_INNER:%.*]] = icmp slt i32 [[J_INC]], [[K]]
 ; CHECK-NEXT:    br i1 [[CMP_INNER]], label [[INNER]], label [[OUTER_BACKEDGE]], [[LOOP0:!llvm.loop !.*]]

diff  --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
index ba7a5840d600..002a0e57d99f 100644
--- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
+++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest.ll
@@ -245,11 +245,10 @@ define void @one_pred_with_extra_op_liveout(i8 %v0, i8 %v1) {
 ; CHECK-LABEL: @one_pred_with_extra_op_liveout(
 ; CHECK-NEXT:  pred:
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; CHECK-NEXT:    br i1 [[C0]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[V1_ADJ:%.*]] = add i8 [[V0]], [[V1:%.*]]
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1_ADJ]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C0]], [[C1]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V1_ADJ]])
@@ -277,11 +276,10 @@ define void @one_pred_with_extra_op_liveout_multiuse(i8 %v0, i8 %v1) {
 ; CHECK-LABEL: @one_pred_with_extra_op_liveout_multiuse(
 ; CHECK-NEXT:  pred:
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; CHECK-NEXT:    br i1 [[C0]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[V1_ADJ:%.*]] = add i8 [[V0]], [[V1:%.*]]
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1_ADJ]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C0]], [[C1]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V1_ADJ]])
@@ -315,11 +313,10 @@ define void @one_pred_with_extra_op_liveout_distant_phi(i8 %v0, i8 %v1) {
 ; CHECK-NEXT:    br i1 [[C0]], label [[PRED:%.*]], label [[LEFT_END:%.*]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[V2_ADJ:%.*]] = add i8 [[V0]], [[V1]]
 ; CHECK-NEXT:    [[C2:%.*]] = icmp eq i8 [[V2_ADJ]], 0
-; CHECK-NEXT:    br i1 [[C2]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C1]], [[C2]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V2_ADJ]])
@@ -468,11 +465,10 @@ define void @one_pred_with_extra_op_eexternally_used_only(i8 %v0, i8 %v1) {
 ; CHECK-LABEL: @one_pred_with_extra_op_eexternally_used_only(
 ; CHECK-NEXT:  pred:
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; CHECK-NEXT:    br i1 [[C0]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[V1_ADJ:%.*]] = add i8 [[V0]], [[V1:%.*]]
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C0]], [[C1]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V1_ADJ]])
@@ -500,11 +496,10 @@ define void @one_pred_with_extra_op_externally_used_only_multiuse(i8 %v0, i8 %v1
 ; CHECK-LABEL: @one_pred_with_extra_op_externally_used_only_multiuse(
 ; CHECK-NEXT:  pred:
 ; CHECK-NEXT:    [[C0:%.*]] = icmp eq i8 [[V0:%.*]], 0
-; CHECK-NEXT:    br i1 [[C0]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[V1_ADJ:%.*]] = add i8 [[V0]], [[V1:%.*]]
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C0]], [[C1]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V1_ADJ]])
@@ -643,11 +638,10 @@ define void @one_pred_with_extra_op_externally_used_only_after_cond_distant_phi(
 ; CHECK-NEXT:    br i1 [[C0]], label [[PRED:%.*]], label [[LEFT_END:%.*]]
 ; CHECK:       pred:
 ; CHECK-NEXT:    [[C1:%.*]] = icmp eq i8 [[V1:%.*]], 0
-; CHECK-NEXT:    br i1 [[C1]], label [[DISPATCH:%.*]], label [[FINAL_RIGHT:%.*]]
-; CHECK:       dispatch:
 ; CHECK-NEXT:    [[C3:%.*]] = icmp eq i8 [[V3:%.*]], 0
 ; CHECK-NEXT:    [[V2_ADJ:%.*]] = add i8 [[V4:%.*]], [[V5:%.*]]
-; CHECK-NEXT:    br i1 [[C3]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT]]
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[C1]], [[C3]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FINAL_LEFT:%.*]], label [[FINAL_RIGHT:%.*]]
 ; CHECK:       final_left:
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    call void @use8(i8 [[V2_ADJ]])
@@ -739,17 +733,22 @@ define void @pr48450() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ]
+; CHECK-NEXT:    [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ]
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @gen1()
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       for.inc:
-; CHECK-NEXT:    [[DEC]] = add i8 [[COUNTDOWN]], -1
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:    [[DEC_OLD:%.*]] = add i8 [[COUNTDOWN]], -1
+; CHECK-NEXT:    [[CMP_NOT_OLD:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT_OLD]], label [[IF_END_LOOPEXIT:%.*]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[C2:%.*]] = call i1 @gen1()
-; CHECK-NEXT:    br i1 [[C2]], label [[FOR_INC]], label [[IF_END_LOOPEXIT]]
+; CHECK-NEXT:    [[C2_NOT:%.*]] = xor i1 [[C2]], true
+; CHECK-NEXT:    [[DEC:%.*]] = add i8 [[COUNTDOWN]], -1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[C2_NOT]], [[CMP_NOT]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
 ; CHECK:       for.bodythread-pre-split:
+; CHECK-NEXT:    [[DEC_MERGE]] = phi i8 [ [[DEC]], [[IF_THEN]] ], [ [[DEC_OLD]], [[FOR_INC]] ]
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    br label [[FOR_BODY]]
 ; CHECK:       if.end.loopexit:
@@ -785,18 +784,23 @@ define void @pr48450_2(i1 %enable_loopback) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ]
+; CHECK-NEXT:    [[COUNTDOWN:%.*]] = phi i8 [ 8, [[ENTRY:%.*]] ], [ [[DEC_MERGE:%.*]], [[FOR_BODYTHREAD_PRE_SPLIT:%.*]] ]
 ; CHECK-NEXT:    [[C:%.*]] = call i1 @gen1()
 ; CHECK-NEXT:    br i1 [[C]], label [[FOR_INC:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       for.inc:
-; CHECK-NEXT:    [[DEC]] = add i8 [[COUNTDOWN]], -1
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
-; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
+; CHECK-NEXT:    [[DEC_OLD:%.*]] = add i8 [[COUNTDOWN]], -1
+; CHECK-NEXT:    [[CMP_NOT_OLD:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT_OLD]], label [[IF_END_LOOPEXIT:%.*]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[C2:%.*]] = call i1 @gen1()
-; CHECK-NEXT:    br i1 [[C2]], label [[FOR_INC]], label [[IF_END_LOOPEXIT]]
+; CHECK-NEXT:    [[C2_NOT:%.*]] = xor i1 [[C2]], true
+; CHECK-NEXT:    [[DEC:%.*]] = add i8 [[COUNTDOWN]], -1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[COUNTDOWN]], 0
+; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[C2_NOT]], [[CMP_NOT]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[IF_END_LOOPEXIT]], label [[FOR_BODYTHREAD_PRE_SPLIT]]
 ; CHECK:       for.bodythread-pre-split:
-; CHECK-NEXT:    [[SHOULD_LOOPBACK:%.*]] = phi i1 [ true, [[FOR_INC]] ], [ false, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ]
+; CHECK-NEXT:    [[DEC_MERGE]] = phi i8 [ [[DEC_OLD]], [[FOR_INC]] ], [ [[DEC_MERGE]], [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK:%.*]] ], [ [[DEC]], [[IF_THEN]] ]
+; CHECK-NEXT:    [[SHOULD_LOOPBACK:%.*]] = phi i1 [ true, [[FOR_INC]] ], [ false, [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK]] ], [ true, [[IF_THEN]] ]
 ; CHECK-NEXT:    [[DO_LOOPBACK:%.*]] = and i1 [[SHOULD_LOOPBACK]], [[ENABLE_LOOPBACK:%.*]]
 ; CHECK-NEXT:    call void @sideeffect0()
 ; CHECK-NEXT:    br i1 [[DO_LOOPBACK]], label [[FOR_BODYTHREAD_PRE_SPLIT_LOOPBACK]], label [[FOR_BODY]]


        


More information about the llvm-commits mailing list