[llvm-branch-commits] [llvm] 372eb2b - [ARM] Add low overhead loops terminators to AnalyzeBranch

David Green via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sat Jan 16 10:35:57 PST 2021


Author: David Green
Date: 2021-01-16T18:30:21Z
New Revision: 372eb2bbb6fb903ce76266e659dfefbaee67722b

URL: https://github.com/llvm/llvm-project/commit/372eb2bbb6fb903ce76266e659dfefbaee67722b
DIFF: https://github.com/llvm/llvm-project/commit/372eb2bbb6fb903ce76266e659dfefbaee67722b.diff

LOG: [ARM] Add low overhead loops terminators to AnalyzeBranch

This treats low-overhead loop branches the same as jump tables and
indirect branches in analyzeBranch: they cannot be analyzed, but the
direct branches at the end of the block may still be removed. This helps
remove the unnecessary branches earlier, which can help produce better
codegen (and change block layout in a number of cases).

Differential Revision: https://reviews.llvm.org/D94392

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
    llvm/lib/Target/ARM/ARMBaseInstrInfo.h
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
    llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
    llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
    llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index fa564f50f679..54586e0c256b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -374,7 +374,8 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
     }
 
     if (isIndirectBranchOpcode(I->getOpcode()) ||
-        isJumpTableBranchOpcode(I->getOpcode())) {
+        isJumpTableBranchOpcode(I->getOpcode()) ||
+        isLowOverheadTerminatorOpcode(I->getOpcode())) {
       // Indirect branches and jump tables can't be analyzed, but we still want
       // to clean up any instructions at the tail of the basic block.
       CantAnalyze = true;

diff  --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index deb008025b1d..b14f7e480856 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -634,6 +634,11 @@ static inline bool isJumpTableBranchOpcode(int Opc) {
          Opc == ARM::t2BR_JT;
 }
 
+static inline bool isLowOverheadTerminatorOpcode(int Opc) {
+  return Opc == ARM::t2DoLoopStartTP || Opc == ARM::t2WhileLoopStart ||
+         Opc == ARM::t2LoopEnd || Opc == ARM::t2LoopEndDec;
+}
+
 static inline
 bool isIndirectBranchOpcode(int Opc) {
   return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index fec6ff7c2154..ec574ad827a4 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -330,9 +330,9 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vdup.32 q2, r12
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    b .LBB2_4
+; CHECK-NEXT:    b .LBB2_5
 ; CHECK-NEXT:  .LBB2_2: @ %cond.load25
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmovx.f16 s0, s28
 ; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    vmov r2, s0
@@ -344,7 +344,7 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    vmov.16 q6[3], r2
 ; CHECK-NEXT:  .LBB2_3: @ %else26
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmul.f16 q0, q6, q5
 ; CHECK-NEXT:    adds r0, #8
 ; CHECK-NEXT:    vcvtt.f32.f16 s23, s1
@@ -355,9 +355,18 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vcvtb.f32.f16 s20, s0
 ; CHECK-NEXT:    vadd.f32 q5, q3, q5
 ; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    bne .LBB2_4
-; CHECK-NEXT:    b .LBB2_21
-; CHECK-NEXT:  .LBB2_4: @ %vector.body
+; CHECK-NEXT:    bne .LBB2_5
+; CHECK-NEXT:  @ %bb.4: @ %middle.block
+; CHECK-NEXT:    vdup.32 q0, r12
+; CHECK-NEXT:    vcmp.u32 cs, q0, q4
+; CHECK-NEXT:    vpsel q0, q5, q3
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vadd.f32 q0, q0, q1
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    vadd.f32 q0, q0, r0
+; CHECK-NEXT:    b .LBB2_23
+; CHECK-NEXT:  .LBB2_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov q3, q5
@@ -379,13 +388,13 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    bfi r2, r4, #3, #1
 ; CHECK-NEXT:    lsls r4, r2, #31
-; CHECK-NEXT:    bne .LBB2_9
-; CHECK-NEXT:  @ %bb.5: @ %else
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bne .LBB2_10
+; CHECK-NEXT:  @ %bb.6: @ %else
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bpl .LBB2_10
-; CHECK-NEXT:  .LBB2_6: @ %cond.load6
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_11
+; CHECK-NEXT:  .LBB2_7: @ %cond.load6
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vldr.16 s20, [r0, #2]
 ; CHECK-NEXT:    vmov r5, s24
 ; CHECK-NEXT:    vmovx.f16 s24, s25
@@ -397,25 +406,25 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vmov.16 q5[3], r4
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bmi .LBB2_11
-; CHECK-NEXT:  .LBB2_7: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bmi .LBB2_12
+; CHECK-NEXT:  .LBB2_8: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    lsls r2, r2, #28
-; CHECK-NEXT:    bmi .LBB2_12
-; CHECK-NEXT:  .LBB2_8: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bmi .LBB2_13
+; CHECK-NEXT:  .LBB2_9: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    b .LBB2_13
-; CHECK-NEXT:  .LBB2_9: @ %cond.load
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    b .LBB2_14
+; CHECK-NEXT:  .LBB2_10: @ %cond.load
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vldr.16 s24, [r0]
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bmi .LBB2_6
-; CHECK-NEXT:  .LBB2_10: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bmi .LBB2_7
+; CHECK-NEXT:  .LBB2_11: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bpl .LBB2_7
-; CHECK-NEXT:  .LBB2_11: @ %cond.load9
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_8
+; CHECK-NEXT:  .LBB2_12: @ %cond.load9
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmovx.f16 s24, s20
 ; CHECK-NEXT:    vmov r4, s20
 ; CHECK-NEXT:    vldr.16 s28, [r0, #4]
@@ -428,9 +437,9 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov r4, s20
 ; CHECK-NEXT:    vmov.16 q6[3], r4
 ; CHECK-NEXT:    lsls r2, r2, #28
-; CHECK-NEXT:    bpl .LBB2_8
-; CHECK-NEXT:  .LBB2_12: @ %cond.load12
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_9
+; CHECK-NEXT:  .LBB2_13: @ %cond.load12
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmovx.f16 s20, s24
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vmov r2, s20
@@ -441,8 +450,8 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov.16 q5[2], r2
 ; CHECK-NEXT:    vmov r2, s24
 ; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:  .LBB2_13: @ %else13
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:  .LBB2_14: @ %else13
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vcmp.u32 cs, q2, q4
 ; CHECK-NEXT:    @ implicit-def: $q7
 ; CHECK-NEXT:    vmrs r4, p0
@@ -460,13 +469,13 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    bfi r2, r4, #3, #1
 ; CHECK-NEXT:    lsls r4, r2, #31
-; CHECK-NEXT:    bne .LBB2_17
-; CHECK-NEXT:  @ %bb.14: @ %else17
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bne .LBB2_18
+; CHECK-NEXT:  @ %bb.15: @ %else17
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bpl .LBB2_18
-; CHECK-NEXT:  .LBB2_15: @ %cond.load19
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_19
+; CHECK-NEXT:  .LBB2_16: @ %cond.load19
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vldr.16 s24, [r1, #2]
 ; CHECK-NEXT:    vmov r5, s28
 ; CHECK-NEXT:    vmovx.f16 s28, s29
@@ -478,23 +487,23 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov r4, s28
 ; CHECK-NEXT:    vmov.16 q6[3], r4
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bmi .LBB2_19
-; CHECK-NEXT:  .LBB2_16: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bmi .LBB2_20
+; CHECK-NEXT:  .LBB2_17: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q7, q6
 ; CHECK-NEXT:    lsls r2, r2, #28
 ; CHECK-NEXT:    bmi.w .LBB2_2
-; CHECK-NEXT:    b .LBB2_20
-; CHECK-NEXT:  .LBB2_17: @ %cond.load16
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    b .LBB2_21
+; CHECK-NEXT:  .LBB2_18: @ %cond.load16
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vldr.16 s28, [r1]
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bmi .LBB2_15
-; CHECK-NEXT:  .LBB2_18: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bmi .LBB2_16
+; CHECK-NEXT:  .LBB2_19: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q6, q7
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bpl .LBB2_16
-; CHECK-NEXT:  .LBB2_19: @ %cond.load22
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_17
+; CHECK-NEXT:  .LBB2_20: @ %cond.load22
+; CHECK-NEXT:    @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmovx.f16 s28, s24
 ; CHECK-NEXT:    vmov r4, s24
 ; CHECK-NEXT:    vldr.16 s0, [r1, #4]
@@ -508,19 +517,9 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vmov.16 q7[3], r4
 ; CHECK-NEXT:    lsls r2, r2, #28
 ; CHECK-NEXT:    bmi.w .LBB2_2
-; CHECK-NEXT:  .LBB2_20: @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:  .LBB2_21: @ in Loop: Header=BB2_5 Depth=1
 ; CHECK-NEXT:    vmov q6, q7
 ; CHECK-NEXT:    b .LBB2_3
-; CHECK-NEXT:  .LBB2_21: @ %middle.block
-; CHECK-NEXT:    vdup.32 q0, r12
-; CHECK-NEXT:    vcmp.u32 cs, q0, q4
-; CHECK-NEXT:    vpsel q0, q5, q3
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vadd.f32 q0, q0, r0
-; CHECK-NEXT:    b .LBB2_23
 ; CHECK-NEXT:  .LBB2_22:
 ; CHECK-NEXT:    vldr s0, .LCPI2_0
 ; CHECK-NEXT:  .LBB2_23: @ %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
index 2b6c067b1f13..872e9bd848ec 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll
@@ -1468,7 +1468,7 @@ define arm_aapcs_vfpcc float @half_half_mac(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    b .LBB9_6
 ; CHECK-NEXT:  .LBB9_3:
 ; CHECK-NEXT:    vldr s0, .LCPI9_0
-; CHECK-NEXT:    b .LBB9_9
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:  .LBB9_4: @ %for.body.preheader.new
 ; CHECK-NEXT:    bic r2, r2, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -1625,7 +1625,7 @@ define arm_aapcs_vfpcc float @half_half_acc(half* nocapture readonly %a, half* n
 ; CHECK-NEXT:    b .LBB10_6
 ; CHECK-NEXT:  .LBB10_3:
 ; CHECK-NEXT:    vldr s0, .LCPI10_0
-; CHECK-NEXT:    b .LBB10_9
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:  .LBB10_4: @ %for.body.preheader.new
 ; CHECK-NEXT:    bic r2, r2, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -1782,7 +1782,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
 ; CHECK-NEXT:    b .LBB11_6
 ; CHECK-NEXT:  .LBB11_3:
 ; CHECK-NEXT:    vldr s0, .LCPI11_0
-; CHECK-NEXT:    b .LBB11_9
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 ; CHECK-NEXT:  .LBB11_4: @ %for.body.preheader.new
 ; CHECK-NEXT:    bic r2, r2, #3
 ; CHECK-NEXT:    movs r3, #1

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 4e72918f63f6..9f0554cc4e23 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
@@ -12,41 +12,43 @@ define arm_aapcs_vfpcc void @test(i16* noalias nocapture readonly %off, i16* noa
 ; CHECK-NEXT:    lsl.w r12, r3, #1
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    mov r4, r1
-; CHECK-NEXT:  .LBB0_2: @ %for.cond1.preheader.us
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_2: @ %for.body15.us
+; CHECK-NEXT:    @ Parent Loop BB0_4 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldrh.w r7, [r0, r6, lsl #1]
+; CHECK-NEXT:    ldrh.w r5, [r1, r6, lsl #1]
+; CHECK-NEXT:    add r5, r7
+; CHECK-NEXT:    strh.w r5, [r2, r6, lsl #1]
+; CHECK-NEXT:    adds r6, #1
+; CHECK-NEXT:    le lr, .LBB0_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup14.us
+; CHECK-NEXT:    @ in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT:    adds r3, #1
+; CHECK-NEXT:    add r2, r12
+; CHECK-NEXT:    add r4, r12
+; CHECK-NEXT:    cmp r3, r8
+; CHECK-NEXT:    beq .LBB0_7
+; CHECK-NEXT:  .LBB0_4: @ %for.cond1.preheader.us
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_2 Depth 2
 ; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:  .LBB0_3: @ %for.body4.us
-; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:  .LBB0_5: @ %for.body4.us
+; CHECK-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh.w r5, [r0, r6, lsl #1]
 ; CHECK-NEXT:    ldrh.w r7, [r1, r6, lsl #1]
 ; CHECK-NEXT:    add r5, r7
 ; CHECK-NEXT:    strh.w r5, [r4, r6, lsl #1]
 ; CHECK-NEXT:    adds r6, #1
-; CHECK-NEXT:    le lr, .LBB0_3
-; CHECK-NEXT:  @ %bb.4: @ %for.body15.us.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB0_5
+; CHECK-NEXT:  @ %bb.6: @ %for.body15.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; CHECK-NEXT:    dls lr, r8
 ; CHECK-NEXT:    movs r6, #0
-; CHECK-NEXT:  .LBB0_5: @ %for.body15.us
-; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh.w r7, [r0, r6, lsl #1]
-; CHECK-NEXT:    ldrh.w r5, [r1, r6, lsl #1]
-; CHECK-NEXT:    add r5, r7
-; CHECK-NEXT:    strh.w r5, [r2, r6, lsl #1]
-; CHECK-NEXT:    adds r6, #1
-; CHECK-NEXT:    le lr, .LBB0_5
-; CHECK-NEXT:  @ %bb.6: @ %for.cond.cleanup14.us
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    add r2, r12
-; CHECK-NEXT:    add r4, r12
-; CHECK-NEXT:    cmp r3, r8
-; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_7: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1ea183d4a5ff..0521594d1edc 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -30,11 +30,28 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    mov r9, r12
 ; ENABLED-NEXT:    uxth r0, r0
 ; ENABLED-NEXT:    rsbs r5, r0, #0
-; ENABLED-NEXT:    b .LBB0_4
-; ENABLED-NEXT:  .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
-; ENABLED-NEXT:    movs r0, #0
-; ENABLED-NEXT:  .LBB0_3: @ %for.end
-; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
+; ENABLED-NEXT:    b .LBB0_5
+; ENABLED-NEXT:  .LBB0_2: @ %vector.body
+; ENABLED-NEXT:    @ Parent Loop BB0_5 Depth=1
+; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
+; ENABLED-NEXT:    vctp.32 r4
+; ENABLED-NEXT:    vmov q0, q1
+; ENABLED-NEXT:    vpstt
+; ENABLED-NEXT:    vldrht.s32 q1, [r0], #8
+; ENABLED-NEXT:    vldrht.s32 q2, [r7], #8
+; ENABLED-NEXT:    mov lr, r6
+; ENABLED-NEXT:    vmul.i32 q1, q2, q1
+; ENABLED-NEXT:    subs r6, #1
+; ENABLED-NEXT:    vshl.s32 q1, r5
+; ENABLED-NEXT:    subs r4, #4
+; ENABLED-NEXT:    vadd.i32 q1, q1, q0
+; ENABLED-NEXT:    le lr, .LBB0_2
+; ENABLED-NEXT:  @ %bb.3: @ %middle.block
+; ENABLED-NEXT:    @ in Loop: Header=BB0_5 Depth=1
+; ENABLED-NEXT:    vpsel q0, q1, q0
+; ENABLED-NEXT:    vaddv.u32 r0, q0
+; ENABLED-NEXT:  .LBB0_4: @ %for.end
+; ENABLED-NEXT:    @ in Loop: Header=BB0_5 Depth=1
 ; ENABLED-NEXT:    lsrs r0, r0, #16
 ; ENABLED-NEXT:    sub.w r9, r9, #1
 ; ENABLED-NEXT:    strh.w r0, [r1, r8, lsl #1]
@@ -42,13 +59,13 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    add.w r10, r10, #2
 ; ENABLED-NEXT:    cmp r8, r3
 ; ENABLED-NEXT:    beq .LBB0_8
-; ENABLED-NEXT:  .LBB0_4: @ %for.body
+; ENABLED-NEXT:  .LBB0_5: @ %for.body
 ; ENABLED-NEXT:    @ =>This Loop Header: Depth=1
-; ENABLED-NEXT:    @ Child Loop BB0_6 Depth 2
+; ENABLED-NEXT:    @ Child Loop BB0_2 Depth 2
 ; ENABLED-NEXT:    cmp r2, r8
-; ENABLED-NEXT:    ble .LBB0_2
-; ENABLED-NEXT:  @ %bb.5: @ %vector.ph
-; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
+; ENABLED-NEXT:    ble .LBB0_7
+; ENABLED-NEXT:  @ %bb.6: @ %vector.ph
+; ENABLED-NEXT:    @ in Loop: Header=BB0_5 Depth=1
 ; ENABLED-NEXT:    bic r0, r9, #3
 ; ENABLED-NEXT:    movs r7, #1
 ; ENABLED-NEXT:    subs r0, #4
@@ -62,26 +79,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; ENABLED-NEXT:    mov r7, r10
 ; ENABLED-NEXT:    dls lr, r0
 ; ENABLED-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; ENABLED-NEXT:  .LBB0_6: @ %vector.body
-; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
-; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
-; ENABLED-NEXT:    vctp.32 r4
-; ENABLED-NEXT:    vmov q0, q1
-; ENABLED-NEXT:    vpstt
-; ENABLED-NEXT:    vldrht.s32 q1, [r0], #8
-; ENABLED-NEXT:    vldrht.s32 q2, [r7], #8
-; ENABLED-NEXT:    mov lr, r6
-; ENABLED-NEXT:    vmul.i32 q1, q2, q1
-; ENABLED-NEXT:    subs r6, #1
-; ENABLED-NEXT:    vshl.s32 q1, r5
-; ENABLED-NEXT:    subs r4, #4
-; ENABLED-NEXT:    vadd.i32 q1, q1, q0
-; ENABLED-NEXT:    le lr, .LBB0_6
-; ENABLED-NEXT:  @ %bb.7: @ %middle.block
-; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
-; ENABLED-NEXT:    vpsel q0, q1, q0
-; ENABLED-NEXT:    vaddv.u32 r0, q0
-; ENABLED-NEXT:    b .LBB0_3
+; ENABLED-NEXT:    b .LBB0_2
+; ENABLED-NEXT:  .LBB0_7: @ in Loop: Header=BB0_5 Depth=1
+; ENABLED-NEXT:    movs r0, #0
+; ENABLED-NEXT:    b .LBB0_4
 ; ENABLED-NEXT:  .LBB0_8: @ %for.end17
 ; ENABLED-NEXT:    add sp, #4
 ; ENABLED-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
@@ -101,11 +102,28 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    mov r9, r12
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
 ; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
-; NOREDUCTIONS-NEXT:    b .LBB0_4
-; NOREDUCTIONS-NEXT:  .LBB0_2: @ in Loop: Header=BB0_4 Depth=1
-; NOREDUCTIONS-NEXT:    movs r0, #0
-; NOREDUCTIONS-NEXT:  .LBB0_3: @ %for.end
-; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
+; NOREDUCTIONS-NEXT:    b .LBB0_5
+; NOREDUCTIONS-NEXT:  .LBB0_2: @ %vector.body
+; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_5 Depth=1
+; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
+; NOREDUCTIONS-NEXT:    vctp.32 r4
+; NOREDUCTIONS-NEXT:    vmov q0, q1
+; NOREDUCTIONS-NEXT:    vpstt
+; NOREDUCTIONS-NEXT:    vldrht.s32 q1, [r0], #8
+; NOREDUCTIONS-NEXT:    vldrht.s32 q2, [r7], #8
+; NOREDUCTIONS-NEXT:    mov lr, r6
+; NOREDUCTIONS-NEXT:    vmul.i32 q1, q2, q1
+; NOREDUCTIONS-NEXT:    subs r6, #1
+; NOREDUCTIONS-NEXT:    vshl.s32 q1, r5
+; NOREDUCTIONS-NEXT:    subs r4, #4
+; NOREDUCTIONS-NEXT:    vadd.i32 q1, q1, q0
+; NOREDUCTIONS-NEXT:    le lr, .LBB0_2
+; NOREDUCTIONS-NEXT:  @ %bb.3: @ %middle.block
+; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_5 Depth=1
+; NOREDUCTIONS-NEXT:    vpsel q0, q1, q0
+; NOREDUCTIONS-NEXT:    vaddv.u32 r0, q0
+; NOREDUCTIONS-NEXT:  .LBB0_4: @ %for.end
+; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_5 Depth=1
 ; NOREDUCTIONS-NEXT:    lsrs r0, r0, #16
 ; NOREDUCTIONS-NEXT:    sub.w r9, r9, #1
 ; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r8, lsl #1]
@@ -113,13 +131,13 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    add.w r10, r10, #2
 ; NOREDUCTIONS-NEXT:    cmp r8, r3
 ; NOREDUCTIONS-NEXT:    beq .LBB0_8
-; NOREDUCTIONS-NEXT:  .LBB0_4: @ %for.body
+; NOREDUCTIONS-NEXT:  .LBB0_5: @ %for.body
 ; NOREDUCTIONS-NEXT:    @ =>This Loop Header: Depth=1
-; NOREDUCTIONS-NEXT:    @ Child Loop BB0_6 Depth 2
+; NOREDUCTIONS-NEXT:    @ Child Loop BB0_2 Depth 2
 ; NOREDUCTIONS-NEXT:    cmp r2, r8
-; NOREDUCTIONS-NEXT:    ble .LBB0_2
-; NOREDUCTIONS-NEXT:  @ %bb.5: @ %vector.ph
-; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
+; NOREDUCTIONS-NEXT:    ble .LBB0_7
+; NOREDUCTIONS-NEXT:  @ %bb.6: @ %vector.ph
+; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_5 Depth=1
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
 ; NOREDUCTIONS-NEXT:    movs r7, #1
 ; NOREDUCTIONS-NEXT:    subs r0, #4
@@ -133,26 +151,10 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
 ; NOREDUCTIONS-NEXT:    mov r7, r10
 ; NOREDUCTIONS-NEXT:    dls lr, r0
 ; NOREDUCTIONS-NEXT:    ldr r0, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
-; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
-; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
-; NOREDUCTIONS-NEXT:    vctp.32 r4
-; NOREDUCTIONS-NEXT:    vmov q0, q1
-; NOREDUCTIONS-NEXT:    vpstt
-; NOREDUCTIONS-NEXT:    vldrht.s32 q1, [r0], #8
-; NOREDUCTIONS-NEXT:    vldrht.s32 q2, [r7], #8
-; NOREDUCTIONS-NEXT:    mov lr, r6
-; NOREDUCTIONS-NEXT:    vmul.i32 q1, q2, q1
-; NOREDUCTIONS-NEXT:    subs r6, #1
-; NOREDUCTIONS-NEXT:    vshl.s32 q1, r5
-; NOREDUCTIONS-NEXT:    subs r4, #4
-; NOREDUCTIONS-NEXT:    vadd.i32 q1, q1, q0
-; NOREDUCTIONS-NEXT:    le lr, .LBB0_6
-; NOREDUCTIONS-NEXT:  @ %bb.7: @ %middle.block
-; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
-; NOREDUCTIONS-NEXT:    vpsel q0, q1, q0
-; NOREDUCTIONS-NEXT:    vaddv.u32 r0, q0
-; NOREDUCTIONS-NEXT:    b .LBB0_3
+; NOREDUCTIONS-NEXT:    b .LBB0_2
+; NOREDUCTIONS-NEXT:  .LBB0_7: @ in Loop: Header=BB0_5 Depth=1
+; NOREDUCTIONS-NEXT:    movs r0, #0
+; NOREDUCTIONS-NEXT:    b .LBB0_4
 ; NOREDUCTIONS-NEXT:  .LBB0_8: @ %for.end17
 ; NOREDUCTIONS-NEXT:    add sp, #4
 ; NOREDUCTIONS-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index c8001df58e8c..854f45bef455 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -66,8 +66,7 @@ define i32 @vcmp_new_vpst_combination(i32 %len, i32* nocapture readonly %arr) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB1_4:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp7 = icmp sgt i32 %len, 0

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index 12e63f1b1edf..c40c30726a47 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -60,39 +60,39 @@ define void @nested(i32* nocapture readonly %x, i32* nocapture readnone %y, i32*
 ; CHECK-NEXT:    ldr r5, [sp, #28]
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    b .LBB1_4
-; CHECK-NEXT:  .LBB1_2: @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    mov r4, r3
-; CHECK-NEXT:  .LBB1_3: @ %if.end
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    str.w r4, [r2, r1, lsl #2]
-; CHECK-NEXT:    adds r1, #1
-; CHECK-NEXT:    cmp r1, r3
-; CHECK-NEXT:    beq .LBB1_8
-; CHECK-NEXT:  .LBB1_4: @ %for.body
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB1_6 Depth 2
-; CHECK-NEXT:    adds r7, r5, #3
-; CHECK-NEXT:    cmp.w r12, r7, lsr #2
-; CHECK-NEXT:    beq .LBB1_2
-; CHECK-NEXT:  @ %bb.5: @ %do.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    b .LBB1_6
+; CHECK-NEXT:  .LBB1_2: @ %do.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    bic r9, r7, #3
 ; CHECK-NEXT:    mov r7, r5
 ; CHECK-NEXT:    mov r4, r3
 ; CHECK-NEXT:    add.w r8, r0, r9, lsl #2
 ; CHECK-NEXT:    dlstp.32 lr, r5
-; CHECK-NEXT:  .LBB1_6: @ %do.body
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:  .LBB1_3: @ %do.body
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vaddva.s32 r4, q0
-; CHECK-NEXT:    letp lr, .LBB1_6
-; CHECK-NEXT:  @ %bb.7: @ %if.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    letp lr, .LBB1_3
+; CHECK-NEXT:  @ %bb.4: @ %if.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    sub.w r5, r5, r9
 ; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    b .LBB1_3
+; CHECK-NEXT:  .LBB1_5: @ %if.end
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    str.w r4, [r2, r1, lsl #2]
+; CHECK-NEXT:    adds r1, #1
+; CHECK-NEXT:    cmp r1, r3
+; CHECK-NEXT:    beq .LBB1_8
+; CHECK-NEXT:  .LBB1_6: @ %for.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
+; CHECK-NEXT:    adds r7, r5, #3
+; CHECK-NEXT:    cmp.w r12, r7, lsr #2
+; CHECK-NEXT:    bne .LBB1_2
+; CHECK-NEXT:  @ %bb.7: @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    mov r4, r3
+; CHECK-NEXT:    b .LBB1_5
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 5bf4ebf92f14..f0a1b2b001d1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1102,9 +1102,20 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    add.w r3, r12, #16
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_3: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_3: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldrh r1, [r6], #2
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    vfma.f16 q0, q1, r1
+; CHECK-NEXT:    bne .LBB16_3
+; CHECK-NEXT:  @ %bb.4: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    add.w r5, r5, r8, lsl #1
+; CHECK-NEXT:  .LBB16_5: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r9, r9, #1
 ; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
@@ -1112,10 +1123,10 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
 ; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  .LBB16_4: @ %while.body
+; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #8
 ; CHECK-NEXT:    ldrh.w lr, [r12, #14]
 ; CHECK-NEXT:    ldrh.w r0, [r12, #12]
@@ -1152,14 +1163,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_7
-; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    blo .LBB16_11
+; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_6: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_8: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
@@ -1190,32 +1201,19 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r1
-; CHECK-NEXT:    le lr, .LBB16_6
-; CHECK-NEXT:    b .LBB16_8
-; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_8
+; CHECK-NEXT:  .LBB16_9: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    cmp.w r8, #0
-; CHECK-NEXT:    beq.w .LBB16_3
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    beq.w .LBB16_5
+; CHECK-NEXT:  @ %bb.10: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    mov r0, r5
 ; CHECK-NEXT:    mov lr, r8
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r1, [r6], #2
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
-; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    vfma.f16 q0, q1, r1
-; CHECK-NEXT:    bne .LBB16_10
-; CHECK-NEXT:    b .LBB16_11
-; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    add.w r5, r5, r8, lsl #1
 ; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_11: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB16_9
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 3986b53cab21..8344f6361114 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1074,19 +1074,30 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    str r4, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_4
-; CHECK-NEXT:  .LBB16_3: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_3: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldr r0, [r7], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    vfma.f32 q0, q1, r0
+; CHECK-NEXT:    bne .LBB16_3
+; CHECK-NEXT:  @ %bb.4: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    add.w r5, r5, r3, lsl #2
+; CHECK-NEXT:  .LBB16_5: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r10, r10, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #2
 ; CHECK-NEXT:    add.w r5, r0, #16
 ; CHECK-NEXT:    beq .LBB16_12
-; CHECK-NEXT:  .LBB16_4: @ %while.body
+; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_6 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    ldrd r7, r6, [r12]
 ; CHECK-NEXT:    ldrd r0, r4, [r12, #8]
@@ -1112,14 +1123,14 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
-; CHECK-NEXT:    blo .LBB16_7
-; CHECK-NEXT:  @ %bb.5: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    blo .LBB16_11
+; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_6: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
+; CHECK-NEXT:  .LBB16_8: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldm.w r7, {r0, r3, r4, r6}
 ; CHECK-NEXT:    vldrw.u32 q1, [r5], #32
@@ -1142,34 +1153,21 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc
 ; CHECK-NEXT:    adds r7, #32
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    vfma.f32 q0, q1, r9
-; CHECK-NEXT:    le lr, .LBB16_6
-; CHECK-NEXT:    b .LBB16_8
-; CHECK-NEXT:  .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_8: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_8
+; CHECK-NEXT:  .LBB16_9: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldrd r9, r1, [sp, #24] @ 8-byte Folded Reload
 ; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    cmp.w r3, #0
-; CHECK-NEXT:    beq .LBB16_3
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
+; CHECK-NEXT:    beq .LBB16_5
+; CHECK-NEXT:  @ %bb.10: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    mov r6, r5
 ; CHECK-NEXT:    mov lr, r3
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_4 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldr r0, [r7], #4
-; CHECK-NEXT:    vldrw.u32 q1, [r6], #4
-; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    bne .LBB16_10
-; CHECK-NEXT:    b .LBB16_11
-; CHECK-NEXT:  .LBB16_11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_4 Depth=1
-; CHECK-NEXT:    add.w r5, r5, r3, lsl #2
 ; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_11: @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB16_9
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
@@ -1581,25 +1579,27 @@ define arm_aapcs_vfpcc void @fms(float* nocapture readonly %pSrc1, float* nocapt
 ; CHECK-NEXT:  @ %bb.1: @ %do.body.preheader
 ; CHECK-NEXT:    ldr.w r12, [sp, #20]
 ; CHECK-NEXT:    lsr.w r5, lr, #2
-; CHECK-NEXT:  .LBB18_2: @ %do.body
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB18_3 Depth 2
-; CHECK-NEXT:    ldr r4, [r2]
-; CHECK-NEXT:    dls lr, r5
-; CHECK-NEXT:    vdup.32 q0, r4
-; CHECK-NEXT:  .LBB18_3: @ %while.body
-; CHECK-NEXT:    @ Parent Loop BB18_2 Depth=1
+; CHECK-NEXT:    b .LBB18_4
+; CHECK-NEXT:  .LBB18_2: @ %while.body
+; CHECK-NEXT:    @ Parent Loop BB18_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
 ; CHECK-NEXT:    vfms.f32 q2, q1, q0
 ; CHECK-NEXT:    vstrb.8 q2, [r3], #16
-; CHECK-NEXT:    le lr, .LBB18_3
-; CHECK-NEXT:  @ %bb.4: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB18_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB18_2
+; CHECK-NEXT:  @ %bb.3: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB18_4 Depth=1
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    add.w r2, r2, #4
-; CHECK-NEXT:    bne .LBB18_2
+; CHECK-NEXT:    beq .LBB18_5
+; CHECK-NEXT:  .LBB18_4: @ %do.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB18_2 Depth 2
+; CHECK-NEXT:    ldr r4, [r2]
+; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    vdup.32 q0, r4
+; CHECK-NEXT:    b .LBB18_2
 ; CHECK-NEXT:  .LBB18_5: @ %do.end
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index c4f68959ecf4..81a6779f885d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -325,23 +325,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
 ; CHECK-NEXT:    adr r3, .LCPI8_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB8_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB8_3 Depth 2
-; CHECK-NEXT:    dls lr, r4
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB8_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB8_2 Depth=1
+; CHECK-NEXT:    b .LBB8_4
+; CHECK-NEXT:  .LBB8_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB8_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB8_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB8_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB8_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB8_4 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB8_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
+; CHECK-NEXT:    beq .LBB8_5
+; CHECK-NEXT:  .LBB8_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB8_2 Depth 2
+; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    b .LBB8_2
+; CHECK-NEXT:  .LBB8_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
@@ -402,16 +404,9 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture rea
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:  .LBB9_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
-; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:  .LBB9_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
+; CHECK-NEXT:    b .LBB9_4
+; CHECK-NEXT:  .LBB9_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB9_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q6, [q5, #48]!
 ; CHECK-NEXT:    vldrw.u32 q7, [q3, #48]!
@@ -419,11 +414,20 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture rea
 ; CHECK-NEXT:    vldrw.u32 q7, [q4, #48]!
 ; CHECK-NEXT:    vadd.i32 q6, q6, q7
 ; CHECK-NEXT:    vstrb.8 q6, [r0], #16
-; CHECK-NEXT:    le lr, .LBB9_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB9_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB9_4 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB9_2
+; CHECK-NEXT:    beq .LBB9_5
+; CHECK-NEXT:  .LBB9_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB9_2 Depth 2
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov q5, q2
+; CHECK-NEXT:    b .LBB9_2
 ; CHECK-NEXT:  .LBB9_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
@@ -498,23 +502,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture reado
 ; CHECK-NEXT:    adr r3, .LCPI10_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB10_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
-; CHECK-NEXT:    dls lr, r4
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB10_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
+; CHECK-NEXT:    b .LBB10_4
+; CHECK-NEXT:  .LBB10_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB10_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #508]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB10_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB10_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB10_4 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB10_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
+; CHECK-NEXT:    beq .LBB10_5
+; CHECK-NEXT:  .LBB10_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB10_2 Depth 2
+; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    b .LBB10_2
+; CHECK-NEXT:  .LBB10_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
@@ -578,15 +584,9 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    adr r6, .LCPI11_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT:  .LBB11_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB11_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:    b .LBB11_4
+; CHECK-NEXT:  .LBB11_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vmov.u16 r7, q2[6]
 ; CHECK-NEXT:    vmov.u16 r3, q2[4]
@@ -632,11 +632,19 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture read
 ; CHECK-NEXT:    vmov.16 q3[6], r5
 ; CHECK-NEXT:    vmov.16 q3[7], r6
 ; CHECK-NEXT:    vstrb.8 q3, [r4], #16
-; CHECK-NEXT:    le lr, .LBB11_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB11_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB11_4 Depth=1
 ; CHECK-NEXT:    cmp r8, r2
-; CHECK-NEXT:    bne .LBB11_2
+; CHECK-NEXT:    beq .LBB11_5
+; CHECK-NEXT:  .LBB11_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB11_2 Depth 2
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r4, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB11_2
 ; CHECK-NEXT:  .LBB11_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    vpop {d8, d9}
@@ -717,17 +725,9 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    str r1, [sp, #52] @ 4-byte Spill
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB12_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
-; CHECK-NEXT:    dls lr, r1
-; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
-; CHECK-NEXT:  .LBB12_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
+; CHECK-NEXT:    b .LBB12_4
+; CHECK-NEXT:  .LBB12_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB12_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vmov.u16 r3, q5[2]
 ; CHECK-NEXT:    vmov.u16 r5, q5[0]
@@ -864,12 +864,22 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture rea
 ; CHECK-NEXT:    vadd.i16 q0, q0, q2
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    vstrb.8 q0, [r4], #16
-; CHECK-NEXT:    le lr, .LBB12_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB12_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB12_4 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r1, r2
-; CHECK-NEXT:    bne.w .LBB12_2
+; CHECK-NEXT:    beq .LBB12_5
+; CHECK-NEXT:  .LBB12_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB12_2 Depth 2
+; CHECK-NEXT:    ldr r1, [sp, #52] @ 4-byte Reload
+; CHECK-NEXT:    dls lr, r1
+; CHECK-NEXT:    ldr r4, [sp, #60] @ 4-byte Reload
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT:    b .LBB12_2
 ; CHECK-NEXT:  .LBB12_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #104
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index cfed9ccaebae..535affceaf3f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -460,29 +460,23 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
 ; CHECK-NEXT:    vldrw.u32 q2, [r7]
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:  .LBB9_1: @ %for.cond8.preheader.us.us.preheader
+; CHECK-NEXT:    b .LBB9_2
+; CHECK-NEXT:  .LBB9_1: @ %for.cond4.for.cond.cleanup6_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
+; CHECK-NEXT:    add.w r8, r8, #1
+; CHECK-NEXT:    cmp r8, r3
+; CHECK-NEXT:    beq .LBB9_6
+; CHECK-NEXT:  .LBB9_2: @ %for.cond8.preheader.us.us.preheader
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB9_2 Depth 2
+; CHECK-NEXT:    @ Child Loop BB9_5 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
 ; CHECK-NEXT:    mul r11, r8, r9
 ; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    mul r7, r8, r12
-; CHECK-NEXT:  .LBB9_2: @ %vector.ph
-; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
-; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT:    vdup.32 q5, r7
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vshl.i32 q5, q5, #2
-; CHECK-NEXT:    vmov q6, q1
-; CHECK-NEXT:    vadd.i32 q5, q5, r0
-; CHECK-NEXT:    dls lr, r10
-; CHECK-NEXT:    vmov.i32 q4, #0x0
-; CHECK-NEXT:    vadd.i32 q5, q5, q0
-; CHECK-NEXT:    vmlas.u32 q6, q2, r5
+; CHECK-NEXT:    b .LBB9_5
 ; CHECK-NEXT:  .LBB9_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB9_1 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=2
+; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB9_5 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vadd.i32 q7, q6, q3
 ; CHECK-NEXT:    vldrw.u32 q0, [r1, q6, uxtw #2]
@@ -492,19 +486,28 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
 ; CHECK-NEXT:    vadd.i32 q4, q0, q4
 ; CHECK-NEXT:    le lr, .LBB9_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=2
+; CHECK-NEXT:    @ in Loop: Header=BB9_5 Depth=2
 ; CHECK-NEXT:    add.w r4, r5, r11
 ; CHECK-NEXT:    adds r5, #1
 ; CHECK-NEXT:    vaddv.u32 r6, q4
 ; CHECK-NEXT:    cmp r5, r9
 ; CHECK-NEXT:    str.w r6, [r2, r4, lsl #2]
-; CHECK-NEXT:    bne .LBB9_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB9_1 Depth=1
-; CHECK-NEXT:    add.w r8, r8, #1
-; CHECK-NEXT:    cmp r8, r3
-; CHECK-NEXT:    bne .LBB9_1
-; CHECK-NEXT:  @ %bb.6: @ %for.end25
+; CHECK-NEXT:    beq .LBB9_1
+; CHECK-NEXT:  .LBB9_5: @ %vector.ph
+; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
+; CHECK-NEXT:    @ => This Loop Header: Depth=2
+; CHECK-NEXT:    @ Child Loop BB9_3 Depth 3
+; CHECK-NEXT:    vdup.32 q5, r7
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vshl.i32 q5, q5, #2
+; CHECK-NEXT:    vmov q6, q1
+; CHECK-NEXT:    vadd.i32 q5, q5, r0
+; CHECK-NEXT:    dls lr, r10
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vadd.i32 q5, q5, q0
+; CHECK-NEXT:    vmlas.u32 q6, q2, r5
+; CHECK-NEXT:    b .LBB9_3
+; CHECK-NEXT:  .LBB9_6: @ %for.end25
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
@@ -861,36 +864,43 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    movs r6, #11
 ; CHECK-NEXT:    vshl.i32 q1, q1, #2
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:  .LBB11_1: @ %for.body10.i
+; CHECK-NEXT:    b .LBB11_2
+; CHECK-NEXT:  .LBB11_1: @ %for.cond.cleanup20.i
+; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
+; CHECK-NEXT:    ldr r5, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #148]
+; CHECK-NEXT:    adds r5, #1
+; CHECK-NEXT:    cmp r5, r7
+; CHECK-NEXT:    it eq
+; CHECK-NEXT:    moveq r5, #0
+; CHECK-NEXT:  .LBB11_2: @ %for.body10.i
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB11_2 Depth 2
-; CHECK-NEXT:    @ Child Loop BB11_3 Depth 3
-; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
-; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
+; CHECK-NEXT:    @ Child Loop BB11_4 Depth 2
+; CHECK-NEXT:    @ Child Loop BB11_9 Depth 3
+; CHECK-NEXT:    @ Child Loop BB11_5 Depth 4
+; CHECK-NEXT:    @ Child Loop BB11_6 Depth 5
 ; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:  .LBB11_2: @ %for.cond22.preheader.i
-; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
+; CHECK-NEXT:    b .LBB11_4
+; CHECK-NEXT:  .LBB11_3: @ %for.cond.cleanup26.i
+; CHECK-NEXT:    @ in Loop: Header=BB11_4 Depth=2
+; CHECK-NEXT:    adds r7, #1
+; CHECK-NEXT:    cmp r7, r3
+; CHECK-NEXT:    beq .LBB11_1
+; CHECK-NEXT:  .LBB11_4: @ %for.cond22.preheader.i
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB11_3 Depth 3
-; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
-; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
+; CHECK-NEXT:    @ Child Loop BB11_9 Depth 3
+; CHECK-NEXT:    @ Child Loop BB11_5 Depth 4
+; CHECK-NEXT:    @ Child Loop BB11_6 Depth 5
 ; CHECK-NEXT:    movs r5, #0
-; CHECK-NEXT:  .LBB11_3: @ %for.body27.i
-; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=2
-; CHECK-NEXT:    @ => This Loop Header: Depth=3
-; CHECK-NEXT:    @ Child Loop BB11_4 Depth 4
-; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT:    dls lr, r9
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov.w r11, #4
-; CHECK-NEXT:  .LBB11_4: @ %for.body78.us.i
-; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=2
-; CHECK-NEXT:    @ Parent Loop BB11_3 Depth=3
+; CHECK-NEXT:    b .LBB11_9
+; CHECK-NEXT:  .LBB11_5: @ %for.body78.us.i
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=2
+; CHECK-NEXT:    @ Parent Loop BB11_9 Depth=3
 ; CHECK-NEXT:    @ => This Loop Header: Depth=4
-; CHECK-NEXT:    @ Child Loop BB11_5 Depth 5
+; CHECK-NEXT:    @ Child Loop BB11_6 Depth 5
 ; CHECK-NEXT:    mul r4, r11, r6
 ; CHECK-NEXT:    vdup.32 q3, r5
 ; CHECK-NEXT:    vdup.32 q2, r7
@@ -900,11 +910,11 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    vadd.i32 q4, q0, r4
 ; CHECK-NEXT:    mov r4, r8
 ; CHECK-NEXT:    vmla.u32 q2, q4, r2
-; CHECK-NEXT:  .LBB11_5: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB11_1 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=2
-; CHECK-NEXT:    @ Parent Loop BB11_3 Depth=3
-; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=4
+; CHECK-NEXT:  .LBB11_6: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=2
+; CHECK-NEXT:    @ Parent Loop BB11_9 Depth=3
+; CHECK-NEXT:    @ Parent Loop BB11_5 Depth=4
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=5
 ; CHECK-NEXT:    vldrb.s32 q6, [r0, q2]
 ; CHECK-NEXT:    vadd.i32 q5, q2, q1
@@ -915,31 +925,27 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
 ; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmlava.u32 r12, q2, q6
 ; CHECK-NEXT:    vmov q2, q5
-; CHECK-NEXT:    bne .LBB11_5
-; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB11_4 Depth=4
+; CHECK-NEXT:    bne .LBB11_6
+; CHECK-NEXT:  @ %bb.7: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB11_5 Depth=4
 ; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    le lr, .LBB11_4
-; CHECK-NEXT:  @ %bb.7: @ %for.cond.cleanup77.i
-; CHECK-NEXT:    @ in Loop: Header=BB11_3 Depth=3
+; CHECK-NEXT:    le lr, .LBB11_5
+; CHECK-NEXT:  @ %bb.8: @ %for.cond.cleanup77.i
+; CHECK-NEXT:    @ in Loop: Header=BB11_9 Depth=3
 ; CHECK-NEXT:    adds r5, #1
 ; CHECK-NEXT:    add.w r10, r10, #1
 ; CHECK-NEXT:    cmp r5, r2
-; CHECK-NEXT:    bne .LBB11_3
-; CHECK-NEXT:  @ %bb.8: @ %for.cond.cleanup26.i
-; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=2
-; CHECK-NEXT:    adds r7, #1
-; CHECK-NEXT:    cmp r7, r3
-; CHECK-NEXT:    bne .LBB11_2
-; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup20.i
-; CHECK-NEXT:    @ in Loop: Header=BB11_1 Depth=1
-; CHECK-NEXT:    ldr r5, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    ldr r7, [sp, #148]
-; CHECK-NEXT:    adds r5, #1
-; CHECK-NEXT:    cmp r5, r7
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq r5, #0
-; CHECK-NEXT:    b .LBB11_1
+; CHECK-NEXT:    beq .LBB11_3
+; CHECK-NEXT:  .LBB11_9: @ %for.body27.i
+; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB11_4 Depth=2
+; CHECK-NEXT:    @ => This Loop Header: Depth=3
+; CHECK-NEXT:    @ Child Loop BB11_5 Depth 4
+; CHECK-NEXT:    @ Child Loop BB11_6 Depth 5
+; CHECK-NEXT:    dls lr, r9
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov.w r11, #4
+; CHECK-NEXT:    b .LBB11_5
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.10:
 ; CHECK-NEXT:  .LCPI11_0:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
index b7e1c340fc5e..74bdd64e976b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-tailpred.ll
@@ -17,23 +17,25 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture read
 ; CHECK-NEXT:    adr r3, .LCPI0_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:  .LBB0_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
-; CHECK-NEXT:    dls lr, r4
-; CHECK-NEXT:    mov r0, r1
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:  .LBB0_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
 ; CHECK-NEXT:    vstrb.8 q2, [r0], #16
-; CHECK-NEXT:    le lr, .LBB0_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB0_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
-; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  @ %bb.5: @ %for.cond.cleanup
+; CHECK-NEXT:    beq .LBB0_5
+; CHECK-NEXT:  .LBB0_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_2 Depth 2
+; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 95a0c9458c8e..7da0903e08ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1866,8 +1866,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB11_8
+; CHECK-NEXT:    cbz r3, .LBB11_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB11_3
@@ -2132,8 +2131,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* noc
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    beq .LBB13_8
+; CHECK-NEXT:    cbz r3, .LBB13_8
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    cmp r3, #3
 ; CHECK-NEXT:    bhi .LBB13_3

diff  --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index 006413638205..280d218e9337 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -160,24 +160,26 @@ define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i3
 ; CHECK-NEXT:    vadd.i32 q4, q3, r0
 ; CHECK-NEXT:    vldrw.u32 q3, [r12]
 ; CHECK-NEXT:    vadd.i32 q3, q3, r0
-; CHECK-NEXT:  .LBB3_2: @ %vector.ph
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    vmov q6, q4
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:  .LBB3_3: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:    b .LBB3_4
+; CHECK-NEXT:  .LBB3_2: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB3_4 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
 ; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
 ; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
-; CHECK-NEXT:    le lr, .LBB3_3
-; CHECK-NEXT:  @ %bb.4: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    le lr, .LBB3_2
+; CHECK-NEXT:  @ %bb.3: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB3_4 Depth=1
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    bne .LBB3_2
+; CHECK-NEXT:    beq .LBB3_5
+; CHECK-NEXT:  .LBB3_4: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB3_2 Depth 2
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    b .LBB3_2
 ; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 728328ac9cba..7186db6cda89 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -18,7 +18,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB0_7
 ; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    b .LBB0_9
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB0_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -45,7 +45,7 @@ define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    ldr r1, [r2], #4
 ; CHECK-NEXT:    add r0, r1
 ; CHECK-NEXT:    le lr, .LBB0_8
-; CHECK-NEXT:  .LBB0_9: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -206,8 +206,8 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB2_7
 ; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    mov.w r2, #-1
-; CHECK-NEXT:    b .LBB2_9
+; CHECK-NEXT:    mov.w r0, #-1
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB2_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -306,8 +306,8 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    b .LBB3_7
 ; CHECK-NEXT:  .LBB3_3:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    b .LBB3_9
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB3_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -406,8 +406,8 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    b .LBB4_7
 ; CHECK-NEXT:  .LBB4_3:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    b .LBB4_9
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB4_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -507,7 +507,8 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB5_7
 ; CHECK-NEXT:  .LBB5_3:
 ; CHECK-NEXT:    vldr s0, .LCPI5_0
-; CHECK-NEXT:    b .LBB5_9
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB5_4: @ %vector.ph
 ; CHECK-NEXT:    bic r2, r1, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -608,7 +609,8 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB6_7
 ; CHECK-NEXT:  .LBB6_3:
 ; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
-; CHECK-NEXT:    b .LBB6_9
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB6_4: @ %vector.ph
 ; CHECK-NEXT:    bic r2, r1, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -704,8 +706,8 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB7_7
 ; CHECK-NEXT:  .LBB7_3:
-; CHECK-NEXT:    mvn r2, #-2147483648
-; CHECK-NEXT:    b .LBB7_9
+; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB7_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -804,7 +806,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB8_7
 ; CHECK-NEXT:  .LBB8_3:
 ; CHECK-NEXT:    mvn r0, #-2147483648
-; CHECK-NEXT:    b .LBB8_9
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB8_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -832,7 +834,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    csel r0, r0, r1, lt
 ; CHECK-NEXT:    le lr, .LBB8_8
-; CHECK-NEXT:  .LBB8_9: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -900,8 +902,8 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB9_7
 ; CHECK-NEXT:  .LBB9_3:
-; CHECK-NEXT:    mov.w r2, #-2147483648
-; CHECK-NEXT:    b .LBB9_9
+; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB9_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1000,7 +1002,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB10_7
 ; CHECK-NEXT:  .LBB10_3:
 ; CHECK-NEXT:    mov.w r0, #-2147483648
-; CHECK-NEXT:    b .LBB10_9
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB10_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1028,7 +1030,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    csel r0, r0, r1, gt
 ; CHECK-NEXT:    le lr, .LBB10_8
-; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -1096,8 +1098,8 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    b .LBB11_7
 ; CHECK-NEXT:  .LBB11_3:
-; CHECK-NEXT:    mov.w r2, #-1
-; CHECK-NEXT:    b .LBB11_9
+; CHECK-NEXT:    mov.w r0, #-1
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB11_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1196,7 +1198,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB12_7
 ; CHECK-NEXT:  .LBB12_3:
 ; CHECK-NEXT:    mov.w r0, #-1
-; CHECK-NEXT:    b .LBB12_9
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB12_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1224,7 +1226,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    csel r0, r0, r1, hi
 ; CHECK-NEXT:    le lr, .LBB12_8
-; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -1292,8 +1294,8 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    b .LBB13_7
 ; CHECK-NEXT:  .LBB13_3:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    b .LBB13_9
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB13_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1392,7 +1394,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB14_7
 ; CHECK-NEXT:  .LBB14_3:
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    b .LBB14_9
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB14_4: @ %vector.ph
 ; CHECK-NEXT:    bic r3, r1, #3
 ; CHECK-NEXT:    movs r2, #1
@@ -1420,7 +1422,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    csel r0, r0, r1, hi
 ; CHECK-NEXT:    le lr, .LBB14_8
-; CHECK-NEXT:  .LBB14_9: @ %for.cond.cleanup
+; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6 = icmp sgt i32 %n, 0
@@ -1489,7 +1491,8 @@ define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB15_7
 ; CHECK-NEXT:  .LBB15_3:
 ; CHECK-NEXT:    vldr s0, .LCPI15_0
-; CHECK-NEXT:    b .LBB15_9
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB15_4: @ %vector.ph
 ; CHECK-NEXT:    bic r2, r1, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -1594,7 +1597,8 @@ define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    b .LBB16_7
 ; CHECK-NEXT:  .LBB16_3:
 ; CHECK-NEXT:    vldr s0, .LCPI16_0
-; CHECK-NEXT:    b .LBB16_9
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB16_4: @ %vector.ph
 ; CHECK-NEXT:    bic r2, r1, #3
 ; CHECK-NEXT:    movs r3, #1
@@ -1701,8 +1705,7 @@ define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB17_4:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6.not = icmp eq i32 %n, 0
@@ -1752,8 +1755,7 @@ define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture r
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB18_4:
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp8.not = icmp eq i32 %n, 0
@@ -1806,8 +1808,7 @@ define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB19_4:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6.not = icmp eq i32 %n, 0
@@ -1858,8 +1859,7 @@ define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture r
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB20_4:
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp9.not = icmp eq i32 %n, 0
@@ -1914,8 +1914,7 @@ define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB21_4:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp6.not = icmp eq i32 %n, 0
@@ -1966,8 +1965,7 @@ define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture re
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    pop {r7, pc}
 ; CHECK-NEXT:  .LBB22_4:
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r0, r12
+; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %cmp9.not = icmp eq i32 %n, 0
@@ -2327,7 +2325,7 @@ define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    cbz r1, .LBB29_3
+; CHECK-NEXT:    cbz r1, .LBB29_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    mov r3, r2
@@ -2337,14 +2335,14 @@ define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
 ; CHECK-NEXT:    vaddlva.s32 r2, r3, q0
 ; CHECK-NEXT:    letp lr, .LBB29_2
-; CHECK-NEXT:    b .LBB29_4
-; CHECK-NEXT:  .LBB29_3:
-; CHECK-NEXT:    movs r2, #0
-; CHECK-NEXT:    mov r3, r2
-; CHECK-NEXT:  .LBB29_4: @ %for.cond.cleanup
+; CHECK-NEXT:  .LBB29_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB29_4:
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    b .LBB29_3
 entry:
   %cmp6.not = icmp eq i32 %n, 0
   br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
@@ -2380,7 +2378,7 @@ define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture r
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    cbz r2, .LBB30_3
+; CHECK-NEXT:    cbz r2, .LBB30_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    mov r3, r12
@@ -2391,14 +2389,14 @@ define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture r
 ; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
 ; CHECK-NEXT:    vmlalva.s32 r12, r3, q1, q0
 ; CHECK-NEXT:    letp lr, .LBB30_2
-; CHECK-NEXT:    b .LBB30_4
-; CHECK-NEXT:  .LBB30_3:
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r3, r12
-; CHECK-NEXT:  .LBB30_4: @ %for.cond.cleanup
+; CHECK-NEXT:  .LBB30_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB30_4:
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    b .LBB30_3
 entry:
   %cmp9.not = icmp eq i32 %n, 0
   br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
@@ -2439,7 +2437,7 @@ define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture r
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    cbz r2, .LBB31_3
+; CHECK-NEXT:    cbz r2, .LBB31_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mov.w r12, #0
 ; CHECK-NEXT:    mov r3, r12
@@ -2450,14 +2448,14 @@ define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture r
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
 ; CHECK-NEXT:    vmlalva.s16 r12, r3, q1, q0
 ; CHECK-NEXT:    letp lr, .LBB31_2
-; CHECK-NEXT:    b .LBB31_4
-; CHECK-NEXT:  .LBB31_3:
-; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    mov r3, r12
-; CHECK-NEXT:  .LBB31_4: @ %for.cond.cleanup
+; CHECK-NEXT:  .LBB31_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB31_4:
+; CHECK-NEXT:    mov.w r12, #0
+; CHECK-NEXT:    mov r3, r12
+; CHECK-NEXT:    b .LBB31_3
 entry:
   %cmp9.not = icmp eq i32 %n, 0
   br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
index a3f1ea295f7c..54ea792184b4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldshuffle.ll
@@ -6,8 +6,7 @@ define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* noc
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq .LBB0_8
+; CHECK-NEXT:    cbz r2, .LBB0_8
 ; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
 ; CHECK-NEXT:    cmp r2, #8
 ; CHECK-NEXT:    blo .LBB0_9


        


More information about the llvm-branch-commits mailing list