[llvm] Bfi precision (PR #66285)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 25 13:41:53 PDT 2023


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/66285

>From 5a951b5c0df4c5fe443faae4ad2ee265db9e5c6c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 22 Sep 2023 11:00:01 -0700
Subject: [PATCH 1/4] Switch some tests to use update_llc_test_checks.py

---
 .../CodeGen/AMDGPU/optimize-negated-cond.ll   | 127 ++++++++++++++----
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  93 ++++++-------
 .../CodeGen/X86/2008-04-17-CoalescerBug.ll    |   1 +
 llvm/test/CodeGen/X86/dup-cost.ll             |  54 ++++++--
 4 files changed, 193 insertions(+), 82 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 52ae259be44f0d4..2a65426d876dcfb 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -1,16 +1,66 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}negated_cond:
-; GCN: .LBB0_1:
-; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: .LBB0_3:
-; GCN-NOT: v_cndmask_b32
-; GCN-NOT: v_cmp
-; GCN:   s_andn2_b64 vcc, exec, [[CC]]
-; GCN:   s_lshl_b32 s12, s12, 5
-; GCN:   s_cbranch_vccz .LBB0_6
 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
+; GCN-LABEL: negated_cond:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_branch .LBB0_2
+; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GCN-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_9
+; GCN-NEXT:  .LBB0_2: ; %bb1
+; GCN-NEXT:    ; =>This Loop Header: Depth=1
+; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
+; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GCN-NEXT:    s_mov_b32 s12, s6
+; GCN-NEXT:    s_branch .LBB0_4
+; GCN-NEXT:  .LBB0_3: ; %Flow1
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
+; GCN-NEXT:    s_cbranch_vccz .LBB0_1
+; GCN-NEXT:  .LBB0_4: ; %bb2
+; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
+; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GCN-NEXT:    s_lshl_b32 s12, s12, 5
+; GCN-NEXT:    s_cbranch_vccz .LBB0_6
+; GCN-NEXT:  ; %bb.5: ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-NEXT:    s_branch .LBB0_7
+; GCN-NEXT:  .LBB0_6: ; %bb3
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_add_i32 s12, s12, 1
+; GCN-NEXT:    s_mov_b64 s[14:15], -1
+; GCN-NEXT:  .LBB0_7: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_mov_b64 s[16:17], -1
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_3
+; GCN-NEXT:  ; %bb.8: ; %bb4
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_ashr_i32 s13, s12, 31
+; GCN-NEXT:    s_lshl_b64 s[16:17], s[12:13], 2
+; GCN-NEXT:    s_mov_b64 s[14:15], 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s16
+; GCN-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_cmp_eq_u32 s12, 32
+; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT:    s_branch .LBB0_3
+; GCN-NEXT:  .LBB0_9: ; %DummyReturnBlock
+; GCN-NEXT:    s_endpgm
 bb:
   br label %bb1
 
@@ -36,20 +86,51 @@ bb4:
   br i1 %tmp7, label %bb1, label %bb2
 }
 
-; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
-; GCN:   s_cmp_lg_u32
-; GCN:   s_cselect_b64  [[CC1:[^,]+]], -1, 0
-; GCN:   s_branch [[BB1:.LBB[0-9]+_[0-9]+]]
-; GCN: [[BB0:.LBB[0-9]+_[0-9]+]]
-; GCN-NOT: v_cndmask_b32
-; GCN-NOT: v_cmp
-; GCN: [[BB1]]:
-; GCN:   s_mov_b64 vcc, [[CC1]]
-; GCN:   s_cbranch_vccz [[BB2:.LBB[0-9]+_[0-9]+]]
-; GCN:   s_mov_b64 vcc, exec
-; GCN:   s_cbranch_execnz [[BB0]]
-; GCN: [[BB2]]:
 define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) {
+; GCN-LABEL: negated_cond_dominated_blocks:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s3, s6
+; GCN-NEXT:    s_branch .LBB1_2
+; GCN-NEXT:  .LBB1_1: ; %bb7
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_ashr_i32 s3, s2, 31
+; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], 2
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NEXT:    s_cmp_eq_u32 s2, 32
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_6
+; GCN-NEXT:  .LBB1_2: ; %bb4
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_cbranch_vccz .LBB1_4
+; GCN-NEXT:  ; %bb.3: ; %bb6
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_add_i32 s2, s3, 1
+; GCN-NEXT:    s_mov_b64 vcc, exec
+; GCN-NEXT:    s_cbranch_execnz .LBB1_1
+; GCN-NEXT:    s_branch .LBB1_5
+; GCN-NEXT:  .LBB1_4: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    ; implicit-def: $sgpr2
+; GCN-NEXT:    s_mov_b64 vcc, 0
+; GCN-NEXT:  .LBB1_5: ; %bb5
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_lshl_b32 s2, s3, 5
+; GCN-NEXT:    s_or_b32 s2, s2, 1
+; GCN-NEXT:    s_branch .LBB1_1
+; GCN-NEXT:  .LBB1_6: ; %bb3
+; GCN-NEXT:    s_endpgm
 bb:
   br label %bb2
 
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index bc7b26abe7e046f..216d4cca097001c 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=ve | FileCheck %s
 ; RUN: llc < %s -mtriple=ve -relocation-model=pic \
 ; RUN:     | FileCheck %s -check-prefix=PIC
@@ -11,22 +12,22 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; CHECK-LABEL: br_jt3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
-; CHECK-NEXT:    breq.w 1, %s0, .LBB{{[0-9]+}}_1
+; CHECK-NEXT:    breq.w 1, %s0, .LBB0_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    breq.w 4, %s0, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    breq.w 4, %s0, .LBB0_5
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    brne.w 2, %s0, .LBB{{[0-9]+}}_6
+; CHECK-NEXT:    brne.w 2, %s0, .LBB0_6
 ; CHECK-NEXT:  # %bb.4:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    or %s0, 7, (0)1
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -78,7 +79,7 @@ define signext i32 @br_jt4(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 3, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_2
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB1_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
@@ -87,7 +88,7 @@ define signext i32 @br_jt4(i32 signext %0) {
 ; CHECK-NEXT:    lea.sl %s1, .Lswitch.table.br_jt4 at hi(, %s1)
 ; CHECK-NEXT:    ldl.sx %s0, (%s0, %s1)
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -138,18 +139,18 @@ define signext i32 @br_jt7(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 8, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_3
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB2_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    and %s2, %s1, (48)0
 ; CHECK-NEXT:    lea %s3, 463
 ; CHECK-NEXT:    and %s3, %s3, (32)0
 ; CHECK-NEXT:    srl %s2, %s3, %s2
 ; CHECK-NEXT:    and %s2, 1, %s2
-; CHECK-NEXT:    brne.w 0, %s2, .LBB{{[0-9]+}}_2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:    brne.w 0, %s2, .LBB2_2
+; CHECK-NEXT:  .LBB2_3:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    lea %s1, .Lswitch.table.br_jt7 at lo
@@ -219,18 +220,18 @@ define signext i32 @br_jt8(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 8, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_3
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB3_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    and %s2, %s1, (48)0
 ; CHECK-NEXT:    lea %s3, 495
 ; CHECK-NEXT:    and %s3, %s3, (32)0
 ; CHECK-NEXT:    srl %s2, %s3, %s2
 ; CHECK-NEXT:    and %s2, 1, %s2
-; CHECK-NEXT:    brne.w 0, %s2, .LBB{{[0-9]+}}_2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:    brne.w 0, %s2, .LBB3_2
+; CHECK-NEXT:  .LBB3_3:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB3_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    lea %s1, .Lswitch.table.br_jt8 at lo
@@ -298,23 +299,23 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; CHECK-LABEL: br_jt3_m:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
-; CHECK-NEXT:    breq.w 1, %s0, .LBB{{[0-9]+}}_1
+; CHECK-NEXT:    breq.w 1, %s0, .LBB4_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    breq.w 4, %s0, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    breq.w 4, %s0, .LBB4_5
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    brne.w 2, %s0, .LBB{{[0-9]+}}_6
+; CHECK-NEXT:    brne.w 2, %s0, .LBB4_6
 ; CHECK-NEXT:  # %bb.4:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB4_1:
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB4_5:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB4_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -368,7 +369,7 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s2, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s3, 3, %s2
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB5_5
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s2, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -378,18 +379,18 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    ld %s2, (%s2, %s0)
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s2)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB5_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB5_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB5_4:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB5_5:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -455,7 +456,7 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s2, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, -1, %s2
 ; CHECK-NEXT:    cmpu.w %s3, 8, %s0
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_8
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB6_8
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -466,32 +467,32 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s3)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB6_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB6_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB6_4:
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_8:
+; CHECK-NEXT:  .LBB6_8:
 ; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_9:
+; CHECK-NEXT:  .LBB6_9:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_7:
+; CHECK-NEXT:  .LBB6_7:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB6_6:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB6_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
@@ -591,7 +592,7 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s2, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, -1, %s2
 ; CHECK-NEXT:    cmpu.w %s3, 8, %s0
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_9
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB7_9
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -602,36 +603,36 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s3)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_9:
+; CHECK-NEXT:  .LBB7_9:
 ; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_10:
+; CHECK-NEXT:  .LBB7_10:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB7_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -5, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB7_6:
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_8:
+; CHECK-NEXT:  .LBB7_8:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_7:
+; CHECK-NEXT:  .LBB7_7:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 6d596195fe7f696..bf939c4131080d3 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
 ; Make sure xorl operands are 32-bit registers.
 
diff --git a/llvm/test/CodeGen/X86/dup-cost.ll b/llvm/test/CodeGen/X86/dup-cost.ll
index 523f0f1154e94d3..ec9d36aa2a11b65 100644
--- a/llvm/test/CodeGen/X86/dup-cost.ll
+++ b/llvm/test/CodeGen/X86/dup-cost.ll
@@ -1,14 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 ; Cold function, %dup should not be duplicated into predecessors.
 define i32 @cold(i32 %a, ptr %p, ptr %q) !prof !21 {
-; CHECK-LABEL: cold
-; CHECK:       %entry
-; CHECK:       %true1
-; CHECK:       %dup
-; CHECK:       %true2
-; CHECK:       %false1
-; CHECK:       %false2
+; CHECK-LABEL: cold:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %true1
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    addl $2, %eax
+; CHECK-NEXT:  .LBB0_3: # %dup
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jl .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %true2
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_2: # %false1
+; CHECK-NEXT:    movl (%rdx), %eax
+; CHECK-NEXT:    addl $-3, %eax
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30
@@ -44,12 +58,26 @@ exit:
 ; Same code as previous function, but with hot profile count.
 ; So %dup should be duplicated into predecessors.
 define i32 @hot(i32 %a, ptr %p, ptr %q) !prof !22 {
-; CHECK-LABEL: hot
-; CHECK:       %entry
-; CHECK:       %true1
-; CHECK:       %false2
-; CHECK:       %false1
-; CHECK:       %true2
+; CHECK-LABEL: hot:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jl .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %true1
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    addl $2, %eax
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jge .LBB1_4
+; CHECK-NEXT:  .LBB1_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_2: # %false1
+; CHECK-NEXT:    movl (%rdx), %eax
+; CHECK-NEXT:    addl $-3, %eax
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jl .LBB1_5
+; CHECK-NEXT:  .LBB1_4: # %true2
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30

>From 35362d0dcde40f8f29849ea7d671a7a2b93a18b3 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Thu, 21 Sep 2023 11:18:57 -0700
Subject: [PATCH 2/4] MachineBlockPlacement: Add tolerance to comparisons

The LLVM BFI data is susceptible to numerical "noise" in the lower bits:
To get some feeling for this consider that we store `BlockFrequency` as
64-bit integers and branch weights as 32-bit. There is a natural
numerical difference when the branch weights divisions are not naturally
representable as integers and because block frequencies are scaled by
loop factors the absolute values of those errors can appear big.
Regardless it should be a small relative difference relative to the
frequency numbers.

This changes make the MachineBlockPlacement algorithm more tolerant
against small relative changes in block frequency numbers:

- Add `BlockFrequency::greaterWithTolerance` and
  `BlockFrequency::lessWithTolerance` functions that allow for
  greater/smaller comparisons for the top N significant bits only
  excluding leading zeros.
- Changes `MachineBlockPlacement::selectBestCandidateBlock` and
  `MachineBlockPlacement::findBestLoopTopHelper` to only consider
  something a better candidate when it passes the tolerance check.
- Blocks that have already been in place prior to the algorithm are
  picked as best candidate when they are not worse than the current best
  with tolerance. This incentivizes to keep the placement as-is when
  multiple candidates have similar frequencies.

The currently value of 20 significant bits works well to reduce
unnecessary changes in my upcoming BFI precision changes. It is likely a
good idea to reduce this number further to accomodate for noise in PGO
data but I did not perform any deeper analysis on what number would fit
this goal.
---
 llvm/include/llvm/Support/BlockFrequency.h    |  36 +++
 llvm/lib/CodeGen/MachineBlockPlacement.cpp    |  46 +++-
 llvm/test/CodeGen/AArch64/cfi-fixup.ll        |  24 +-
 .../CodeGen/AArch64/machine-licm-sub-loop.ll  |  67 +++--
 llvm/test/CodeGen/AArch64/ragreedy-csr.ll     | 234 +++++++++---------
 llvm/test/CodeGen/AArch64/sink-and-fold.ll    |  52 ++--
 .../divergent-branch-uniform-condition.ll     |  31 +--
 .../CodeGen/AMDGPU/loop_header_nopred.mir     |  83 ++++---
 ...p-var-out-of-divergent-loop-swdev407790.ll |  70 +++---
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |  68 ++---
 llvm/test/CodeGen/AMDGPU/multilevel-break.ll  |  51 ++--
 .../CodeGen/AMDGPU/optimize-negated-cond.ll   |  62 ++---
 .../AMDGPU/promote-constOffset-to-imm.ll      | 125 +++++-----
 llvm/test/CodeGen/ARM/code-placement.ll       |  11 +-
 llvm/test/CodeGen/ARM/loop-indexing.ll        |  23 +-
 llvm/test/CodeGen/AVR/rot.ll                  |  32 +--
 llvm/test/CodeGen/AVR/rotate.ll               |  18 +-
 llvm/test/CodeGen/AVR/shift.ll                |  16 +-
 .../Generic/machine-function-splitter.ll      |   1 -
 .../Hexagon/lsr-postinc-nested-loop.ll        |   4 +-
 llvm/test/CodeGen/Hexagon/prof-early-if.ll    |   2 +-
 llvm/test/CodeGen/Hexagon/swp-multi-loops.ll  |   8 +-
 .../Mips/indirect-jump-hazard/jumptables.ll   | 208 ++++++++--------
 llvm/test/CodeGen/Mips/jump-table-mul.ll      |  32 +--
 llvm/test/CodeGen/Mips/nacl-align.ll          |  10 +-
 llvm/test/CodeGen/Mips/pseudo-jump-fill.ll    |  14 +-
 .../PowerPC/2007-11-16-landingpad-split.ll    |  28 ++-
 llvm/test/CodeGen/PowerPC/branch-opt.ll       |   9 +-
 .../PowerPC/jump-tables-collapse-rotate.ll    |  39 ++-
 llvm/test/CodeGen/PowerPC/licm-tocReg.ll      |  44 ++--
 .../CodeGen/PowerPC/lsr-profitable-chain.ll   |  34 +--
 llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll   |   8 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll   |  20 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crun.ll   |  32 +--
 llvm/test/CodeGen/PowerPC/pr43527.ll          |   8 +-
 llvm/test/CodeGen/PowerPC/pr45448.ll          |  14 +-
 llvm/test/CodeGen/PowerPC/pr46759.ll          |  16 +-
 llvm/test/CodeGen/PowerPC/pr48519.ll          |  44 ++--
 .../CodeGen/PowerPC/stack-clash-prologue.ll   |  81 +++---
 .../CodeGen/RISCV/shrinkwrap-jump-table.ll    |  20 +-
 llvm/test/CodeGen/Thumb/pr42760.ll            |  33 +--
 .../Thumb2/LowOverheadLoops/memcall.ll        |  50 ++--
 .../CodeGen/Thumb2/bti-indirect-branches.ll   |  37 +--
 llvm/test/CodeGen/Thumb2/constant-hoisting.ll |  18 +-
 .../test/CodeGen/Thumb2/mve-blockplacement.ll |  72 +++---
 .../CodeGen/Thumb2/mve-float16regloops.ll     |  68 ++---
 .../CodeGen/Thumb2/mve-float32regloops.ll     | 132 +++++-----
 .../Thumb2/mve-gather-scatter-optimisation.ll | 153 ++++++------
 llvm/test/CodeGen/Thumb2/mve-memtp-branch.ll  |  78 +++---
 llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll    |  64 +++--
 llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll   |  70 +++---
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  64 ++---
 .../CodeGen/X86/2007-01-13-StackPtrIndex.ll   | 105 ++++----
 .../CodeGen/X86/2008-04-17-CoalescerBug.ll    |  18 +-
 llvm/test/CodeGen/X86/2009-08-12-badswitch.ll |   6 +-
 llvm/test/CodeGen/X86/block-placement.ll      |   2 +-
 llvm/test/CodeGen/X86/callbr-asm-outputs.ll   |  20 +-
 .../X86/code_placement_loop_rotation2.ll      |   2 +-
 .../CodeGen/X86/dag-update-nodetomatch.ll     |  47 ++--
 llvm/test/CodeGen/X86/dup-cost.ll             |   6 +-
 llvm/test/CodeGen/X86/hoist-invariant-load.ll |  30 ++-
 .../test/CodeGen/X86/loop-strength-reduce7.ll |  11 +-
 llvm/test/CodeGen/X86/mul-constant-result.ll  | 112 ++++-----
 llvm/test/CodeGen/X86/pic.ll                  |  22 +-
 llvm/test/CodeGen/X86/postalloc-coalescing.ll |   9 +-
 llvm/test/CodeGen/X86/pr38743.ll              |  10 +-
 llvm/test/CodeGen/X86/pr38795.ll              |  79 +++---
 llvm/test/CodeGen/X86/pr63692.ll              |   9 +-
 llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll |   8 +-
 .../speculative-load-hardening-indirect.ll    |  24 +-
 .../CodeGen/X86/speculative-load-hardening.ll |  32 +--
 llvm/test/CodeGen/X86/switch.ll               |  21 +-
 .../X86/tail-dup-merge-loop-headers.ll        |  84 +++----
 llvm/test/CodeGen/X86/win-catchpad.ll         |   8 +-
 llvm/unittests/Support/BlockFrequencyTest.cpp |  50 ++++
 75 files changed, 1716 insertions(+), 1563 deletions(-)

diff --git a/llvm/include/llvm/Support/BlockFrequency.h b/llvm/include/llvm/Support/BlockFrequency.h
index 12a753301b36aba..eb5f48e58073881 100644
--- a/llvm/include/llvm/Support/BlockFrequency.h
+++ b/llvm/include/llvm/Support/BlockFrequency.h
@@ -15,6 +15,7 @@
 
 #include <cassert>
 #include <cstdint>
+#include <limits.h>
 #include <optional>
 
 namespace llvm {
@@ -93,6 +94,41 @@ class BlockFrequency {
     return *this;
   }
 
+  /// Returns true if `this` is greater than `other` and the magnitude of the
+  /// difference falls within the topmost `SignificantBits` of `this`.
+  ///
+  /// The formula is related to comparing a "relative change" to a threshold.
+  /// When choosing the threshold as a negative power-of-two things can be
+  /// computed cheaply:
+  ///   with A = this->Frequency; B = Other.Frequency; T = SignificantBits - 1;
+  ///   relative_change(A, B) = (A - B) / A
+  ///         (A - B) / A  >  2**(-T)
+  ///   <=>   A - B        >  2**(-T) * A
+  ///   <=>   A - B        >  shr(A, T)
+  bool greaterWithTolerance(BlockFrequency Other,
+                            unsigned SignificantBits) const {
+    assert(0 < SignificantBits &&
+           SignificantBits <= sizeof(Frequency) * CHAR_BIT &&
+           "invalid SignificantBits value");
+    return Frequency > Other.Frequency &&
+           (Frequency - Other.Frequency) >=
+               (Frequency >> (SignificantBits - 1));
+  }
+
+  /// Returns true if `this` is less than `other` and the magnitude of the
+  /// difference falls within the topmost `SignificantBits` of `this`.
+  ///
+  /// See `greaterWithTolerance` for how this is related to the
+  /// "relative change" formula.
+  bool lessWithTolerance(BlockFrequency Other, unsigned SignificantBits) const {
+    assert(0 < SignificantBits &&
+           SignificantBits <= sizeof(Frequency) * CHAR_BIT &&
+           "invalid SignificantBits value");
+    return Frequency < Other.Frequency &&
+           (Other.Frequency - Frequency) >=
+               (Frequency >> (SignificantBits - 1));
+  }
+
   bool operator<(BlockFrequency RHS) const {
     return Frequency < RHS.Frequency;
   }
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 603c0e9600afc32..d939cadb81d9b8d 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -213,6 +213,9 @@ static cl::opt<bool> RenumberBlocksBeforeView(
         "into a dot graph. Only used when a function is being printed."),
     cl::init(false), cl::Hidden);
 
+// Number of significant bits when comparing BlockFrequencies with tolerance.
+static constexpr unsigned BlockFreqCompBits = 20;
+
 namespace llvm {
 extern cl::opt<bool> EnableExtTspBlockPlacement;
 extern cl::opt<bool> ApplyExtTspWithoutProfile;
@@ -1715,6 +1718,7 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
 
   bool IsEHPad = WorkList[0]->isEHPad();
 
+  MachineBasicBlock *ChainEnd = *std::prev(Chain.end());
   MachineBasicBlock *BestBlock = nullptr;
   BlockFrequency BestFreq;
   for (MachineBasicBlock *MBB : WorkList) {
@@ -1730,7 +1734,9 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
 
     BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
     LLVM_DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> ";
-               MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
+               MBFI->printBlockFreq(dbgs(), CandidateFreq)
+               << " (freq) layoutsucc " << ChainEnd->isLayoutSuccessor(MBB)
+               << '\n');
 
     // For ehpad, we layout the least probable first as to avoid jumping back
     // from least probable landingpads to more probable ones.
@@ -1750,11 +1756,26 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
     //                 +-------------------------------------+
     //                 V                                     |
     // OuterLp -> OuterCleanup -> Resume     InnerLp -> InnerCleanup
-    if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
-      continue;
-
-    BestBlock = MBB;
-    BestFreq = CandidateFreq;
+    bool Better = false;
+    if (BestBlock == nullptr) {
+      Better = true;
+    } else if (!IsEHPad) {
+      // Be tolerant to numerical noise: Pick block if clearly better or
+      //  for the current successor if it is not not clearly worse.
+      if (CandidateFreq.greaterWithTolerance(BestFreq, BlockFreqCompBits) ||
+          (ChainEnd->isLayoutSuccessor(MBB) &&
+           !CandidateFreq.lessWithTolerance(BestFreq, BlockFreqCompBits)))
+        Better = true;
+    } else {
+      if (CandidateFreq.lessWithTolerance(BestFreq, BlockFreqCompBits) ||
+          (ChainEnd->isLayoutSuccessor(MBB) &&
+           !CandidateFreq.greaterWithTolerance(BestFreq, BlockFreqCompBits)))
+        Better = true;
+    }
+    if (Better) {
+      BestBlock = MBB;
+      BestFreq = CandidateFreq;
+    }
   }
 
   return BestBlock;
@@ -2095,8 +2116,8 @@ MachineBlockPlacement::findBestLoopTopHelper(
     if (Pred == L.getHeader())
       continue;
     LLVM_DEBUG(dbgs() << "   old top pred: " << getBlockName(Pred) << ", has "
-                      << Pred->succ_size() << " successors, ";
-               MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
+                      << Pred->succ_size() << " successors, freq ";
+               MBFI->printBlockFreq(dbgs(), Pred) << '\n');
     if (Pred->succ_size() > 2)
       continue;
 
@@ -2112,8 +2133,13 @@ MachineBlockPlacement::findBestLoopTopHelper(
 
     BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
                                             LoopBlockSet);
-    if ((Gains > 0) && (Gains > BestGains ||
-        ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
+    LLVM_DEBUG(dbgs() << "    Candidate: " << getBlockName(Pred) << " freq ";
+               MBFI->printBlockFreq(dbgs(), Pred);
+               dbgs() << " layoutsucc " << Pred->isLayoutSuccessor(OldTop)
+                      << " gains " << Gains.getFrequency() << '\n');
+    if (Gains.greaterWithTolerance(BestGains, BlockFreqCompBits) ||
+        (Pred->isLayoutSuccessor(OldTop) &&
+         !Gains.lessWithTolerance(BestGains, BlockFreqCompBits))) {
       BestPred = Pred;
       BestGains = Gains;
     }
diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup.ll b/llvm/test/CodeGen/AArch64/cfi-fixup.ll
index 9a4ad3bb07ee364..e746ea745b92a8c 100644
--- a/llvm/test/CodeGen/AArch64/cfi-fixup.ll
+++ b/llvm/test/CodeGen/AArch64/cfi-fixup.ll
@@ -8,13 +8,13 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    cbz w0, .LBB0_4
+; CHECK-NEXT:    cbz w0, .LBB0_6
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    cmp w0, #2
-; CHECK-NEXT:    b.eq .LBB0_5
+; CHECK-NEXT:    b.eq .LBB0_4
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    cmp w0, #1
-; CHECK-NEXT:    b.ne .LBB0_6
+; CHECK-NEXT:    b.ne .LBB0_5
 ; CHECK-NEXT:  // %bb.3: // %if.then2
 ; CHECK-NEXT:    bl g1
 ; CHECK-NEXT:    add w0, w0, #1
@@ -22,27 +22,27 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:  .LBB0_4: // %if.then5
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    bl g0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_5: // %if.then5
+; CHECK-NEXT:  .LBB0_5: // %if.end7
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    bl g0
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    sub w0, w8, w0
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_6: // %if.end7
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    .cfi_restore_state
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
@@ -115,7 +115,7 @@ define i32 @f2(i32 %x) #0 {
 ; CHECK-NEXT:    cbz w0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %if.end
 ; CHECK-NEXT:    bl g1
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index d4da4f28bb4a3c7..04fd80cd8213fdd 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -15,31 +15,45 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:    and x12, x10, #0xfffffff0
 ; CHECK-NEXT:    add x13, x1, #32
 ; CHECK-NEXT:    add x14, x2, #16
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
-; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    b .LBB0_6
+; CHECK-NEXT:  .LBB0_2: // in Loop: Header=BB0_6 Depth=1
+; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:  .LBB0_3: // %for.body4.us.preheader
+; CHECK-NEXT:    // in Loop: Header=BB0_6 Depth=1
+; CHECK-NEXT:    add x16, x18, x8
+; CHECK-NEXT:    add x17, x2, x18, lsl #1
+; CHECK-NEXT:    sub x18, x10, x18
+; CHECK-NEXT:    add x16, x1, x16, lsl #2
+; CHECK-NEXT:  .LBB0_4: // %for.body4.us
+; CHECK-NEXT:    // Parent Loop BB0_6 Depth=1
+; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldrsh w3, [x17], #2
+; CHECK-NEXT:    ldr w4, [x16]
+; CHECK-NEXT:    subs x18, x18, #1
+; CHECK-NEXT:    madd w3, w3, w15, w4
+; CHECK-NEXT:    str w3, [x16], #4
+; CHECK-NEXT:    b.ne .LBB0_4
+; CHECK-NEXT:  .LBB0_5: // %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK-NEXT:    // in Loop: Header=BB0_6 Depth=1
 ; CHECK-NEXT:    add x9, x9, #1
 ; CHECK-NEXT:    add x13, x13, x11
 ; CHECK-NEXT:    add x8, x8, x10
 ; CHECK-NEXT:    cmp x9, x10
 ; CHECK-NEXT:    b.eq .LBB0_10
-; CHECK-NEXT:  .LBB0_3: // %for.cond1.preheader.us
+; CHECK-NEXT:  .LBB0_6: // %for.cond1.preheader.us
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
-; CHECK-NEXT:    // Child Loop BB0_6 Depth 2
-; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
+; CHECK-NEXT:    // Child Loop BB0_8 Depth 2
+; CHECK-NEXT:    // Child Loop BB0_4 Depth 2
 ; CHECK-NEXT:    ldrsh w15, [x2, x9, lsl #1]
 ; CHECK-NEXT:    cmp w0, #16
-; CHECK-NEXT:    b.hs .LBB0_5
-; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    mov x18, xzr
-; CHECK-NEXT:    b .LBB0_8
-; CHECK-NEXT:  .LBB0_5: // %vector.ph
-; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    b.lo .LBB0_2
+; CHECK-NEXT:  // %bb.7: // %vector.ph
+; CHECK-NEXT:    // in Loop: Header=BB0_6 Depth=1
 ; CHECK-NEXT:    mov x16, x14
 ; CHECK-NEXT:    mov x17, x13
 ; CHECK-NEXT:    mov x18, x12
-; CHECK-NEXT:  .LBB0_6: // %vector.body
-; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:  .LBB0_8: // %vector.body
+; CHECK-NEXT:    // Parent Loop BB0_6 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    dup v0.8h, w15
 ; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
@@ -53,28 +67,13 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
 ; CHECK-NEXT:    stp q3, q2, [x17, #-32]
 ; CHECK-NEXT:    stp q6, q5, [x17], #64
-; CHECK-NEXT:    b.ne .LBB0_6
-; CHECK-NEXT:  // %bb.7: // %middle.block
-; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    b.ne .LBB0_8
+; CHECK-NEXT:  // %bb.9: // %middle.block
+; CHECK-NEXT:    // in Loop: Header=BB0_6 Depth=1
 ; CHECK-NEXT:    cmp x12, x10
 ; CHECK-NEXT:    mov x18, x12
-; CHECK-NEXT:    b.eq .LBB0_2
-; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
-; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    add x16, x18, x8
-; CHECK-NEXT:    add x17, x2, x18, lsl #1
-; CHECK-NEXT:    sub x18, x10, x18
-; CHECK-NEXT:    add x16, x1, x16, lsl #2
-; CHECK-NEXT:  .LBB0_9: // %for.body4.us
-; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
-; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrsh w3, [x17], #2
-; CHECK-NEXT:    ldr w4, [x16]
-; CHECK-NEXT:    subs x18, x18, #1
-; CHECK-NEXT:    madd w3, w3, w15, w4
-; CHECK-NEXT:    str w3, [x16], #4
-; CHECK-NEXT:    b.ne .LBB0_9
-; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:    b.ne .LBB0_3
+; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
index 5b501762418ef50..fa0ec49907658b5 100644
--- a/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/llvm/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -24,7 +24,7 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ldrh w9, [x1]
 ; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne LBB0_47
+; CHECK-NEXT:    b.ne LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %if.end
 ; CHECK-NEXT:    sub sp, sp, #64
 ; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
@@ -41,8 +41,12 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:  Lloh1:
 ; CHECK-NEXT:    ldr x14, [x14, __DefaultRuneLocale at GOTPAGEOFF]
 ; CHECK-NEXT:    ldrsb x8, [x9, x11]
-; CHECK-NEXT:    tbz x8, #63, LBB0_3
-; CHECK-NEXT:  LBB0_2: ; %cond.false.i.i
+; CHECK-NEXT:    tbz x8, #63, LBB0_7
+; CHECK-NEXT:    b LBB0_6
+; CHECK-NEXT:  LBB0_2:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  LBB0_3: ; %cond.false.i.i219
 ; CHECK-NEXT:    stp x9, x0, [sp, #32] ; 16-byte Folded Spill
 ; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    mov w1, #32768 ; =0x8000
@@ -61,32 +65,17 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    ldr w12, [sp, #4] ; 4-byte Folded Reload
 ; CHECK-NEXT:    ldr x10, [sp, #8] ; 8-byte Folded Reload
 ; CHECK-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
-; CHECK-NEXT:    cbz w8, LBB0_4
-; CHECK-NEXT:    b LBB0_6
-; CHECK-NEXT:  LBB0_3: ; %cond.true.i.i
-; CHECK-NEXT:    add x8, x14, x8, lsl #2
-; CHECK-NEXT:    ldr w8, [x8, #60]
-; CHECK-NEXT:    and w8, w8, #0x8000
-; CHECK-NEXT:    cbnz w8, LBB0_6
-; CHECK-NEXT:  LBB0_4: ; %lor.rhs
-; CHECK-NEXT:    ldrsb x8, [x10, x11]
-; CHECK-NEXT:    tbnz x8, #63, LBB0_8
-; CHECK-NEXT:  ; %bb.5: ; %cond.true.i.i217
-; CHECK-NEXT:    add x8, x14, x8, lsl #2
-; CHECK-NEXT:    ldr w8, [x8, #60]
-; CHECK-NEXT:    and w8, w8, #0x8000
-; CHECK-NEXT:    cbz w8, LBB0_9
-; CHECK-NEXT:  LBB0_6: ; %while.body
+; CHECK-NEXT:    cbz w8, LBB0_10
+; CHECK-NEXT:  LBB0_4: ; %while.body
 ; CHECK-NEXT:    ldrb w8, [x9, x11]
 ; CHECK-NEXT:    ldrb w15, [x10, x11]
 ; CHECK-NEXT:    cmp w8, w15
-; CHECK-NEXT:    b.ne LBB0_42
-; CHECK-NEXT:  ; %bb.7: ; %if.end17
+; CHECK-NEXT:    b.ne LBB0_43
+; CHECK-NEXT:  ; %bb.5: ; %if.end17
 ; CHECK-NEXT:    add x11, x11, #1
 ; CHECK-NEXT:    ldrsb x8, [x9, x11]
-; CHECK-NEXT:    tbz x8, #63, LBB0_3
-; CHECK-NEXT:    b LBB0_2
-; CHECK-NEXT:  LBB0_8: ; %cond.false.i.i219
+; CHECK-NEXT:    tbz x8, #63, LBB0_7
+; CHECK-NEXT:  LBB0_6: ; %cond.false.i.i
 ; CHECK-NEXT:    stp x9, x0, [sp, #32] ; 16-byte Folded Spill
 ; CHECK-NEXT:    mov w0, w8
 ; CHECK-NEXT:    mov w1, #32768 ; =0x8000
@@ -105,163 +94,174 @@ define fastcc i32 @prune_match(ptr nocapture readonly %a, ptr nocapture readonly
 ; CHECK-NEXT:    ldr w12, [sp, #4] ; 4-byte Folded Reload
 ; CHECK-NEXT:    ldr x10, [sp, #8] ; 8-byte Folded Reload
 ; CHECK-NEXT:    ldr x0, [sp, #40] ; 8-byte Folded Reload
-; CHECK-NEXT:    cbnz w8, LBB0_6
-; CHECK-NEXT:  LBB0_9: ; %while.end
+; CHECK-NEXT:    cbz w8, LBB0_8
+; CHECK-NEXT:    b LBB0_4
+; CHECK-NEXT:  LBB0_7: ; %cond.true.i.i
+; CHECK-NEXT:    add x8, x14, x8, lsl #2
+; CHECK-NEXT:    ldr w8, [x8, #60]
+; CHECK-NEXT:    and w8, w8, #0x8000
+; CHECK-NEXT:    cbnz w8, LBB0_4
+; CHECK-NEXT:  LBB0_8: ; %lor.rhs
+; CHECK-NEXT:    ldrsb x8, [x10, x11]
+; CHECK-NEXT:    tbnz x8, #63, LBB0_3
+; CHECK-NEXT:  ; %bb.9: ; %cond.true.i.i217
+; CHECK-NEXT:    add x8, x14, x8, lsl #2
+; CHECK-NEXT:    ldr w8, [x8, #60]
+; CHECK-NEXT:    and w8, w8, #0x8000
+; CHECK-NEXT:    cbnz w8, LBB0_4
+; CHECK-NEXT:  LBB0_10: ; %while.end
 ; CHECK-NEXT:    orr w8, w13, w12
-; CHECK-NEXT:    cbnz w8, LBB0_24
-; CHECK-NEXT:  ; %bb.10: ; %if.then23
+; CHECK-NEXT:    cbnz w8, LBB0_25
+; CHECK-NEXT:  ; %bb.11: ; %if.then23
 ; CHECK-NEXT:    ldr x12, [x0, #16]
 ; CHECK-NEXT:    ldrb w8, [x9, x11]
 ; CHECK-NEXT:    ldrb w13, [x12]
 ; CHECK-NEXT:    cmp w13, #83
-; CHECK-NEXT:    b.eq LBB0_19
-; CHECK-NEXT:  LBB0_11: ; %while.cond59.preheader
-; CHECK-NEXT:    cbz w8, LBB0_23
-; CHECK-NEXT:  LBB0_12: ; %land.rhs.preheader
+; CHECK-NEXT:    b.eq LBB0_20
+; CHECK-NEXT:  LBB0_12: ; %while.cond59.preheader
+; CHECK-NEXT:    cbz w8, LBB0_24
+; CHECK-NEXT:  LBB0_13: ; %land.rhs.preheader
 ; CHECK-NEXT:    add x12, x9, x11
 ; CHECK-NEXT:    add x9, x10, x11
 ; CHECK-NEXT:    add x10, x12, #1
-; CHECK-NEXT:  LBB0_13: ; %land.rhs
+; CHECK-NEXT:  LBB0_14: ; %land.rhs
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldrb w11, [x9], #1
-; CHECK-NEXT:    cbz w11, LBB0_23
-; CHECK-NEXT:  ; %bb.14: ; %while.body66
-; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    cmp w8, #42
-; CHECK-NEXT:    b.eq LBB0_18
+; CHECK-NEXT:    cbz w11, LBB0_24
 ; CHECK-NEXT:  ; %bb.15: ; %while.body66
-; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    ; in Loop: Header=BB0_14 Depth=1
+; CHECK-NEXT:    cmp w8, #42
+; CHECK-NEXT:    b.eq LBB0_19
+; CHECK-NEXT:  ; %bb.16: ; %while.body66
+; CHECK-NEXT:    ; in Loop: Header=BB0_14 Depth=1
 ; CHECK-NEXT:    cmp w11, #42
-; CHECK-NEXT:    b.eq LBB0_18
-; CHECK-NEXT:  ; %bb.16: ; %lor.lhs.false74
-; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    b.eq LBB0_19
+; CHECK-NEXT:  ; %bb.17: ; %lor.lhs.false74
+; CHECK-NEXT:    ; in Loop: Header=BB0_14 Depth=1
 ; CHECK-NEXT:    cmp w8, w11
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    b.ne LBB0_43
-; CHECK-NEXT:  ; %bb.17: ; %lor.lhs.false74
-; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    b.ne LBB0_44
+; CHECK-NEXT:  ; %bb.18: ; %lor.lhs.false74
+; CHECK-NEXT:    ; in Loop: Header=BB0_14 Depth=1
 ; CHECK-NEXT:    cmp w8, #94
-; CHECK-NEXT:    b.eq LBB0_43
-; CHECK-NEXT:  LBB0_18: ; %if.then83
-; CHECK-NEXT:    ; in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    b.eq LBB0_44
+; CHECK-NEXT:  LBB0_19: ; %if.then83
+; CHECK-NEXT:    ; in Loop: Header=BB0_14 Depth=1
 ; CHECK-NEXT:    ldrb w8, [x10], #1
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
-; CHECK-NEXT:    cbnz w8, LBB0_13
-; CHECK-NEXT:    b LBB0_43
-; CHECK-NEXT:  LBB0_19: ; %land.lhs.true28
-; CHECK-NEXT:    cbz w8, LBB0_23
-; CHECK-NEXT:  ; %bb.20: ; %land.lhs.true28
+; CHECK-NEXT:    cbnz w8, LBB0_14
+; CHECK-NEXT:    b LBB0_44
+; CHECK-NEXT:  LBB0_20: ; %land.lhs.true28
+; CHECK-NEXT:    cbz w8, LBB0_24
+; CHECK-NEXT:  ; %bb.21: ; %land.lhs.true28
 ; CHECK-NEXT:    cmp w8, #112
-; CHECK-NEXT:    b.ne LBB0_12
-; CHECK-NEXT:  ; %bb.21: ; %land.lhs.true35
+; CHECK-NEXT:    b.ne LBB0_13
+; CHECK-NEXT:  ; %bb.22: ; %land.lhs.true35
 ; CHECK-NEXT:    ldrb w13, [x10, x11]
 ; CHECK-NEXT:    cmp w13, #112
-; CHECK-NEXT:    b.ne LBB0_12
-; CHECK-NEXT:  ; %bb.22: ; %land.lhs.true43
+; CHECK-NEXT:    b.ne LBB0_13
+; CHECK-NEXT:  ; %bb.23: ; %land.lhs.true43
 ; CHECK-NEXT:    sub x12, x9, x12
 ; CHECK-NEXT:    add x12, x12, x11
 ; CHECK-NEXT:    cmp x12, #1
-; CHECK-NEXT:    b.ne LBB0_44
-; CHECK-NEXT:  LBB0_23:
+; CHECK-NEXT:    b.ne LBB0_45
+; CHECK-NEXT:  LBB0_24:
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
-; CHECK-NEXT:    b LBB0_43
-; CHECK-NEXT:  LBB0_24: ; %if.else88
+; CHECK-NEXT:    b LBB0_44
+; CHECK-NEXT:  LBB0_25: ; %if.else88
 ; CHECK-NEXT:    cmp w12, #1
-; CHECK-NEXT:    b.ne LBB0_33
-; CHECK-NEXT:  ; %bb.25: ; %if.else88
+; CHECK-NEXT:    b.ne LBB0_34
+; CHECK-NEXT:  ; %bb.26: ; %if.else88
 ; CHECK-NEXT:    cmp w13, #2
-; CHECK-NEXT:    b.ne LBB0_33
-; CHECK-NEXT:  ; %bb.26: ; %while.cond95.preheader
+; CHECK-NEXT:    b.ne LBB0_34
+; CHECK-NEXT:  ; %bb.27: ; %while.cond95.preheader
 ; CHECK-NEXT:    ldrb w12, [x9, x11]
-; CHECK-NEXT:    cbz w12, LBB0_23
-; CHECK-NEXT:  ; %bb.27: ; %land.rhs99.preheader
+; CHECK-NEXT:    cbz w12, LBB0_24
+; CHECK-NEXT:  ; %bb.28: ; %land.rhs99.preheader
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
-; CHECK-NEXT:    b LBB0_29
-; CHECK-NEXT:  LBB0_28: ; %if.then117
-; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
+; CHECK-NEXT:    b LBB0_30
+; CHECK-NEXT:  LBB0_29: ; %if.then117
+; CHECK-NEXT:    ; in Loop: Header=BB0_30 Depth=1
 ; CHECK-NEXT:    add x12, x9, x8
 ; CHECK-NEXT:    add x8, x8, #1
 ; CHECK-NEXT:    add x12, x12, x11
 ; CHECK-NEXT:    ldrb w12, [x12, #1]
-; CHECK-NEXT:    cbz w12, LBB0_43
-; CHECK-NEXT:  LBB0_29: ; %land.rhs99
+; CHECK-NEXT:    cbz w12, LBB0_44
+; CHECK-NEXT:  LBB0_30: ; %land.rhs99
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x13, x10, x8
 ; CHECK-NEXT:    ldrb w13, [x13, x11]
-; CHECK-NEXT:    cbz w13, LBB0_23
-; CHECK-NEXT:  ; %bb.30: ; %while.body104
-; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
-; CHECK-NEXT:    cmp w12, w13
-; CHECK-NEXT:    b.eq LBB0_28
+; CHECK-NEXT:    cbz w13, LBB0_24
 ; CHECK-NEXT:  ; %bb.31: ; %while.body104
-; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
-; CHECK-NEXT:    cmp w12, #42
-; CHECK-NEXT:    b.eq LBB0_28
+; CHECK-NEXT:    ; in Loop: Header=BB0_30 Depth=1
+; CHECK-NEXT:    cmp w12, w13
+; CHECK-NEXT:    b.eq LBB0_29
 ; CHECK-NEXT:  ; %bb.32: ; %while.body104
-; CHECK-NEXT:    ; in Loop: Header=BB0_29 Depth=1
+; CHECK-NEXT:    ; in Loop: Header=BB0_30 Depth=1
+; CHECK-NEXT:    cmp w12, #42
+; CHECK-NEXT:    b.eq LBB0_29
+; CHECK-NEXT:  ; %bb.33: ; %while.body104
+; CHECK-NEXT:    ; in Loop: Header=BB0_30 Depth=1
 ; CHECK-NEXT:    cmp w13, #94
-; CHECK-NEXT:    b.eq LBB0_28
-; CHECK-NEXT:    b LBB0_42
-; CHECK-NEXT:  LBB0_33: ; %if.else123
+; CHECK-NEXT:    b.eq LBB0_29
+; CHECK-NEXT:    b LBB0_43
+; CHECK-NEXT:  LBB0_34: ; %if.else123
 ; CHECK-NEXT:    cmp w13, #1
 ; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    b.ne LBB0_43
-; CHECK-NEXT:  ; %bb.34: ; %if.else123
+; CHECK-NEXT:    b.ne LBB0_44
+; CHECK-NEXT:  ; %bb.35: ; %if.else123
 ; CHECK-NEXT:    cmp w12, #2
-; CHECK-NEXT:    b.ne LBB0_43
-; CHECK-NEXT:  ; %bb.35: ; %while.cond130.preheader
+; CHECK-NEXT:    b.ne LBB0_44
+; CHECK-NEXT:  ; %bb.36: ; %while.cond130.preheader
 ; CHECK-NEXT:    ldrb w8, [x9, x11]
-; CHECK-NEXT:    cbz w8, LBB0_23
-; CHECK-NEXT:  ; %bb.36: ; %land.rhs134.preheader
+; CHECK-NEXT:    cbz w8, LBB0_24
+; CHECK-NEXT:  ; %bb.37: ; %land.rhs134.preheader
 ; CHECK-NEXT:    mov x12, xzr
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
-; CHECK-NEXT:    b LBB0_38
-; CHECK-NEXT:  LBB0_37: ; %if.then152
-; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
+; CHECK-NEXT:    b LBB0_39
+; CHECK-NEXT:  LBB0_38: ; %if.then152
+; CHECK-NEXT:    ; in Loop: Header=BB0_39 Depth=1
 ; CHECK-NEXT:    add x8, x9, x12
 ; CHECK-NEXT:    add x12, x12, #1
 ; CHECK-NEXT:    add x8, x8, x11
 ; CHECK-NEXT:    ldrb w8, [x8, #1]
-; CHECK-NEXT:    cbz w8, LBB0_43
-; CHECK-NEXT:  LBB0_38: ; %land.rhs134
+; CHECK-NEXT:    cbz w8, LBB0_44
+; CHECK-NEXT:  LBB0_39: ; %land.rhs134
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    add x13, x10, x12
 ; CHECK-NEXT:    ldrb w13, [x13, x11]
-; CHECK-NEXT:    cbz w13, LBB0_23
-; CHECK-NEXT:  ; %bb.39: ; %while.body139
-; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    cmp w8, w13
-; CHECK-NEXT:    b.eq LBB0_37
+; CHECK-NEXT:    cbz w13, LBB0_24
 ; CHECK-NEXT:  ; %bb.40: ; %while.body139
-; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
-; CHECK-NEXT:    cmp w13, #42
-; CHECK-NEXT:    b.eq LBB0_37
+; CHECK-NEXT:    ; in Loop: Header=BB0_39 Depth=1
+; CHECK-NEXT:    cmp w8, w13
+; CHECK-NEXT:    b.eq LBB0_38
 ; CHECK-NEXT:  ; %bb.41: ; %while.body139
-; CHECK-NEXT:    ; in Loop: Header=BB0_38 Depth=1
+; CHECK-NEXT:    ; in Loop: Header=BB0_39 Depth=1
+; CHECK-NEXT:    cmp w13, #42
+; CHECK-NEXT:    b.eq LBB0_38
+; CHECK-NEXT:  ; %bb.42: ; %while.body139
+; CHECK-NEXT:    ; in Loop: Header=BB0_39 Depth=1
 ; CHECK-NEXT:    cmp w8, #94
-; CHECK-NEXT:    b.eq LBB0_37
-; CHECK-NEXT:  LBB0_42:
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    b.eq LBB0_38
 ; CHECK-NEXT:  LBB0_43:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:  LBB0_44:
 ; CHECK-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  LBB0_44: ; %lor.lhs.false47
+; CHECK-NEXT:  LBB0_45: ; %lor.lhs.false47
 ; CHECK-NEXT:    cmp x12, #2
-; CHECK-NEXT:    b.ne LBB0_11
-; CHECK-NEXT:  ; %bb.45: ; %land.lhs.true52
+; CHECK-NEXT:    b.ne LBB0_12
+; CHECK-NEXT:  ; %bb.46: ; %land.lhs.true52
 ; CHECK-NEXT:    add x12, x9, x11
 ; CHECK-NEXT:    mov w0, #1 ; =0x1
 ; CHECK-NEXT:    ldurb w12, [x12, #-1]
 ; CHECK-NEXT:    cmp w12, #73
-; CHECK-NEXT:    b.eq LBB0_43
-; CHECK-NEXT:  ; %bb.46: ; %land.lhs.true52
-; CHECK-NEXT:    cbz w8, LBB0_43
-; CHECK-NEXT:    b LBB0_12
-; CHECK-NEXT:  LBB0_47:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b.eq LBB0_44
+; CHECK-NEXT:  ; %bb.47: ; %land.lhs.true52
+; CHECK-NEXT:    cbz w8, LBB0_44
+; CHECK-NEXT:    b LBB0_13
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh0, Lloh1
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh2, Lloh3
 ; CHECK-NEXT:    .loh AdrpLdrGot Lloh4, Lloh5
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 632fdb391053121..66afb840b50db99 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -142,40 +142,42 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    b .LBB4_3
-; CHECK-NEXT:  .LBB4_2: // %LI.latch
-; CHECK-NEXT:    // in Loop: Header=BB4_3 Depth=1
-; CHECK-NEXT:    cmp x22, x19
-; CHECK-NEXT:    mov x22, x23
-; CHECK-NEXT:    b.ge .LBB4_8
-; CHECK-NEXT:  .LBB4_3: // %LI
-; CHECK-NEXT:    // =>This Loop Header: Depth=1
-; CHECK-NEXT:    // Child Loop BB4_6 Depth 2
-; CHECK-NEXT:    mov x21, xzr
-; CHECK-NEXT:    add x23, x22, #1
-; CHECK-NEXT:    b .LBB4_6
-; CHECK-NEXT:  .LBB4_4: // %if.else
-; CHECK-NEXT:    // in Loop: Header=BB4_6 Depth=2
-; CHECK-NEXT:    ldr w0, [x20, x22, lsl #2]
-; CHECK-NEXT:  .LBB4_5: // %LJ.latch
-; CHECK-NEXT:    // in Loop: Header=BB4_6 Depth=2
+; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    b .LBB4_7
+; CHECK-NEXT:  .LBB4_2: // %if.else
+; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
+; CHECK-NEXT:    ldr w0, [x22]
+; CHECK-NEXT:  .LBB4_3: // %LJ.latch
+; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
 ; CHECK-NEXT:    add x8, x21, #1
 ; CHECK-NEXT:    str w0, [x20, x21, lsl #2]
 ; CHECK-NEXT:    sub x9, x8, #1
 ; CHECK-NEXT:    mov x21, x8
 ; CHECK-NEXT:    cmp x9, x19
-; CHECK-NEXT:    b.ge .LBB4_2
-; CHECK-NEXT:  .LBB4_6: // %LJ
-; CHECK-NEXT:    // Parent Loop BB4_3 Depth=1
+; CHECK-NEXT:    b.ge .LBB4_6
+; CHECK-NEXT:  .LBB4_4: // %LJ
+; CHECK-NEXT:    // Parent Loop BB4_7 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr w8, [x20, x21, lsl #2]
-; CHECK-NEXT:    tbz w8, #31, .LBB4_4
-; CHECK-NEXT:  // %bb.7: // %if.then
-; CHECK-NEXT:    // in Loop: Header=BB4_6 Depth=2
-; CHECK-NEXT:    add x0, x20, x22, lsl #2
+; CHECK-NEXT:    tbz w8, #31, .LBB4_2
+; CHECK-NEXT:  // %bb.5: // %if.then
+; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
+; CHECK-NEXT:    mov x0, x22
 ; CHECK-NEXT:    mov x1, x21
 ; CHECK-NEXT:    bl use
-; CHECK-NEXT:    b .LBB4_5
+; CHECK-NEXT:    b .LBB4_3
+; CHECK-NEXT:  .LBB4_6: // %LI.latch
+; CHECK-NEXT:    // in Loop: Header=BB4_7 Depth=1
+; CHECK-NEXT:    cmp x23, x19
+; CHECK-NEXT:    mov x23, x24
+; CHECK-NEXT:    b.ge .LBB4_8
+; CHECK-NEXT:  .LBB4_7: // %LI
+; CHECK-NEXT:    // =>This Loop Header: Depth=1
+; CHECK-NEXT:    // Child Loop BB4_4 Depth 2
+; CHECK-NEXT:    add x22, x20, x23, lsl #2
+; CHECK-NEXT:    mov x21, xzr
+; CHECK-NEXT:    add x24, x23, #1
+; CHECK-NEXT:    b .LBB4_4
 ; CHECK-NEXT:  .LBB4_8:
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
index fadb774b0822296..7838f2b59fa4426 100644
--- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -26,13 +26,18 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT:    s_mov_b64 s[0:1], 0
 ; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; ISA-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; ISA-NEXT:    s_branch .LBB0_3
-; ISA-NEXT:  .LBB0_1: ; %Flow1
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_branch .LBB0_4
+; ISA-NEXT:  .LBB0_1: ; %endif1
+; ISA-NEXT:    ; in Loop: Header=BB0_4 Depth=1
+; ISA-NEXT:    s_mov_b64 s[4:5], -1
+; ISA-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; ISA-NEXT:    s_cbranch_execnz .LBB0_5
+; ISA-NEXT:  .LBB0_2: ; %Flow1
+; ISA-NEXT:    ; in Loop: Header=BB0_4 Depth=1
 ; ISA-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; ISA-NEXT:    s_mov_b64 s[6:7], 0
-; ISA-NEXT:  .LBB0_2: ; %Flow
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:  .LBB0_3: ; %Flow
+; ISA-NEXT:    ; in Loop: Header=BB0_4 Depth=1
 ; ISA-NEXT:    s_and_b64 s[10:11], exec, s[4:5]
 ; ISA-NEXT:    s_or_b64 s[0:1], s[10:11], s[0:1]
 ; ISA-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
@@ -40,22 +45,18 @@ define amdgpu_ps void @main(i32 %0, float %1) {
 ; ISA-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
 ; ISA-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; ISA-NEXT:    s_cbranch_execz .LBB0_6
-; ISA-NEXT:  .LBB0_3: ; %loop
+; ISA-NEXT:  .LBB0_4: ; %loop
 ; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], exec
 ; ISA-NEXT:    s_cmp_lt_u32 s8, 32
 ; ISA-NEXT:    s_mov_b64 s[6:7], -1
-; ISA-NEXT:    s_cbranch_scc0 .LBB0_2
-; ISA-NEXT:  ; %bb.4: ; %endif1
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; ISA-NEXT:    s_mov_b64 s[4:5], -1
-; ISA-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; ISA-NEXT:    s_cbranch_execz .LBB0_1
-; ISA-NEXT:  ; %bb.5: ; %endif2
-; ISA-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; ISA-NEXT:    s_cbranch_scc1 .LBB0_1
+; ISA-NEXT:    s_branch .LBB0_3
+; ISA-NEXT:  .LBB0_5: ; %endif2
+; ISA-NEXT:    ; in Loop: Header=BB0_4 Depth=1
 ; ISA-NEXT:    s_add_i32 s8, s8, 1
 ; ISA-NEXT:    s_xor_b64 s[4:5], exec, -1
-; ISA-NEXT:    s_branch .LBB0_1
+; ISA-NEXT:    s_branch .LBB0_2
 ; ISA-NEXT:  .LBB0_6: ; %Flow2
 ; ISA-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; ISA-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
index b7e1dfb2eda28c0..eb07b05daca5e82 100644
--- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
+++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir
@@ -10,22 +10,17 @@ name:            loop_header_nopred
 body:             |
   ; GFX10-LABEL: name: loop_header_nopred
   ; GFX10: bb.0:
-  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   S_BRANCH %bb.2
-  ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT: bb.1 (align 64):
-  ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.2(0x7c000000)
-  ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo
+  ; GFX10-NEXT:   S_BRANCH %bb.1
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.1(0x40000000)
+  ; GFX10-NEXT: bb.3 (align 64):
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT: bb.5:
-  ; GFX10-NEXT:   successors: %bb.1(0x04000000), %bb.5(0x7c000000)
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   S_NOP 0
   ; GFX10-NEXT:   S_NOP 0
@@ -38,29 +33,43 @@ body:             |
   ; GFX10-NEXT:   S_NOP 0
   ; GFX10-NEXT:   S_NOP 0
   ; GFX10-NEXT:   S_NOP 0
-  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
-  ; GFX10-NEXT:   S_BRANCH %bb.1
+  ; GFX10-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   S_CBRANCH_VCCZ %bb.3, implicit $vcc_lo
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   S_BRANCH %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   S_ENDPGM 0
+  ;
   ; GFX11-LABEL: name: loop_header_nopred
   ; GFX11: bb.0:
-  ; GFX11-NEXT:   successors: %bb.2(0x80000000)
-  ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   S_BRANCH %bb.2
+  ; GFX11-NEXT:   successors: %bb.1(0x80000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.1:
-  ; GFX11-NEXT:   successors: %bb.7(0x04000000), %bb.2(0x7c000000)
-  ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo
+  ; GFX11-NEXT:   S_BRANCH %bb.1
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.2:
-  ; GFX11-NEXT:   successors: %bb.5(0x40000000), %bb.1(0x40000000)
+  ; GFX11-NEXT: bb.3:
+  ; GFX11-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; GFX11-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
   ; GFX11-NEXT: {{  $}}
-  ; GFX11-NEXT: bb.5:
-  ; GFX11-NEXT:   successors: %bb.1(0x04000000), %bb.5(0x7c000000)
+  ; GFX11-NEXT: bb.4:
+  ; GFX11-NEXT:   successors: %bb.5(0x04000000), %bb.4(0x7c000000)
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT:   S_NOP 0
   ; GFX11-NEXT:   S_NOP 0
@@ -73,8 +82,26 @@ body:             |
   ; GFX11-NEXT:   S_NOP 0
   ; GFX11-NEXT:   S_NOP 0
   ; GFX11-NEXT:   S_NOP 0
-  ; GFX11-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
-  ; GFX11-NEXT:   S_BRANCH %bb.1
+  ; GFX11-NEXT:   S_CBRANCH_EXECZ %bb.4, implicit $exec
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT: bb.5:
+  ; GFX11-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT: bb.6:
+  ; GFX11-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit $vcc_lo
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT: bb.1:
+  ; GFX11-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   S_CBRANCH_VCCZ %bb.3, implicit $vcc_lo
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT: bb.2:
+  ; GFX11-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX11-NEXT: {{  $}}
+  ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.7:
   ; GFX11-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
index e2456b74f7ef1fa..363428335b99345 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll
@@ -16,36 +16,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
 ; CHECK-NEXT:    s_xor_b32 s6, s4, -1
 ; CHECK-NEXT:    s_inst_prefetch 0x1
-; CHECK-NEXT:    s_branch .LBB0_3
-; CHECK-NEXT:    .p2align 6
-; CHECK-NEXT:  .LBB0_1: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT:  .LBB0_2: ; %Flow1
-; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; j lastloop entry
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_or_b32 s5, s4, s5
-; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; CHECK-NEXT:    s_cbranch_execz .LBB0_8
-; CHECK-NEXT:  .LBB0_3: ; %for.body33
-; CHECK-NEXT:    ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_6 Depth 2
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    s_and_saveexec_b32 s7, s6
-; CHECK-NEXT:    s_cbranch_execz .LBB0_2
-; CHECK-NEXT:  ; %bb.4: ; %for.body51.preheader
-; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    s_mov_b32 s8, 0
-; CHECK-NEXT:    s_mov_b32 s9, 0
 ; CHECK-NEXT:    s_branch .LBB0_6
 ; CHECK-NEXT:    .p2align 6
-; CHECK-NEXT:  .LBB0_5: ; %if.end118
-; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT:  .LBB0_1: ; %if.end118
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; CHECK-NEXT:    s_add_i32 s9, s9, 4
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -56,20 +30,46 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
 ; CHECK-NEXT:    v_add_nc_u32_e32 v4, -4, v4
 ; CHECK-NEXT:    s_or_b32 s8, s4, s8
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s8
-; CHECK-NEXT:    s_cbranch_execz .LBB0_1
-; CHECK-NEXT:  .LBB0_6: ; %for.body51
-; CHECK-NEXT:    ; Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_4
+; CHECK-NEXT:  .LBB0_2: ; %for.body51
+; CHECK-NEXT:    ; Parent Loop BB0_6 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 1
 ; CHECK-NEXT:    s_and_saveexec_b32 s4, vcc_lo
-; CHECK-NEXT:    s_cbranch_execz .LBB0_5
-; CHECK-NEXT:  ; %bb.7: ; %if.then112
-; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=2
+; CHECK-NEXT:    s_cbranch_execz .LBB0_1
+; CHECK-NEXT:  ; %bb.3: ; %if.then112
+; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; CHECK-NEXT:    s_add_i32 s10, s9, 4
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, s10
 ; CHECK-NEXT:    ds_write_b32 v1, v4
-; CHECK-NEXT:    s_branch .LBB0_5
+; CHECK-NEXT:    s_branch .LBB0_1
+; CHECK-NEXT:    .p2align 6
+; CHECK-NEXT:  .LBB0_4: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; CHECK-NEXT:  .LBB0_5: ; %Flow1
+; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=1
+; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s7
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s4, 0, v3
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; j lastloop entry
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_or_b32 s5, s4, s5
+; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; CHECK-NEXT:    s_cbranch_execz .LBB0_8
+; CHECK-NEXT:  .LBB0_6: ; %for.body33
+; CHECK-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB0_2 Depth 2
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_and_saveexec_b32 s7, s6
+; CHECK-NEXT:    s_cbranch_execz .LBB0_5
+; CHECK-NEXT:  ; %bb.7: ; %for.body51.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_6 Depth=1
+; CHECK-NEXT:    s_mov_b32 s8, 0
+; CHECK-NEXT:    s_mov_b32 s9, 0
+; CHECK-NEXT:    s_branch .LBB0_2
 ; CHECK-NEXT:  .LBB0_8: ; %for.body159.preheader
 ; CHECK-NEXT:    s_inst_prefetch 0x2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s5
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 0e6c1aecb6774a3..c777d1183deb669 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -102,28 +102,14 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[6:7]
 ; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v3, v1, vcc
-; CHECK-NEXT:    s_branch .LBB0_12
-; CHECK-NEXT:  .LBB0_10: ; %Flow19
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
-; CHECK-NEXT:    s_or_b64 exec, exec, s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[8:9], 0
-; CHECK-NEXT:  .LBB0_11: ; %Flow21
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_20
-; CHECK-NEXT:  .LBB0_12: ; %while.cond
-; CHECK-NEXT:    ; =>This Loop Header: Depth=1
-; CHECK-NEXT:    ; Child Loop BB0_14 Depth 2
-; CHECK-NEXT:    ; Child Loop BB0_18 Depth 2
-; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_15
-; CHECK-NEXT:  ; %bb.13: ; %loop-memcpy-expansion2.preheader
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_branch .LBB0_19
+; CHECK-NEXT:  .LBB0_10: ; %loop-memcpy-expansion2.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[10:11], 0
 ; CHECK-NEXT:    s_mov_b64 s[12:13], 0
 ; CHECK-NEXT:    s_mov_b64 s[14:15], 0
-; CHECK-NEXT:  .LBB0_14: ; %loop-memcpy-expansion2
-; CHECK-NEXT:    ; Parent Loop BB0_12 Depth=1
+; CHECK-NEXT:  .LBB0_11: ; %loop-memcpy-expansion2
+; CHECK-NEXT:    ; Parent Loop BB0_19 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v10, s10
 ; CHECK-NEXT:    v_mov_b32_e32 v11, s11
@@ -169,23 +155,23 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    flat_store_byte v[10:11], v20 offset:13
 ; CHECK-NEXT:    flat_store_byte v[10:11], v27 offset:12
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_14
-; CHECK-NEXT:  .LBB0_15: ; %Flow20
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_11
+; CHECK-NEXT:  .LBB0_12: ; %Flow20
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    s_mov_b64 s[8:9], -1
-; CHECK-NEXT:    s_cbranch_execz .LBB0_11
-; CHECK-NEXT:  ; %bb.16: ; %loop-memcpy-residual-header5
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_18
+; CHECK-NEXT:  ; %bb.13: ; %loop-memcpy-residual-header5
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
 ; CHECK-NEXT:    s_xor_b64 s[10:11], exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_execz .LBB0_10
-; CHECK-NEXT:  ; %bb.17: ; %loop-memcpy-residual4.preheader
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execz .LBB0_17
+; CHECK-NEXT:  ; %bb.14: ; %loop-memcpy-residual4.preheader
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[12:13], 0
 ; CHECK-NEXT:    s_mov_b64 s[14:15], 0
-; CHECK-NEXT:  .LBB0_18: ; %loop-memcpy-residual4
-; CHECK-NEXT:    ; Parent Loop BB0_12 Depth=1
+; CHECK-NEXT:  .LBB0_15: ; %loop-memcpy-residual4
+; CHECK-NEXT:    ; Parent Loop BB0_19 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v12, s15
 ; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s14, v0
@@ -200,11 +186,25 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_byte v[10:11], v13
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_18
-; CHECK-NEXT:  ; %bb.19: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_12 Depth=1
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_15
+; CHECK-NEXT:  ; %bb.16: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[12:13]
-; CHECK-NEXT:    s_branch .LBB0_10
+; CHECK-NEXT:  .LBB0_17: ; %Flow19
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
+; CHECK-NEXT:    s_or_b64 exec, exec, s[10:11]
+; CHECK-NEXT:    s_mov_b64 s[8:9], 0
+; CHECK-NEXT:  .LBB0_18: ; %Flow21
+; CHECK-NEXT:    ; in Loop: Header=BB0_19 Depth=1
+; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; CHECK-NEXT:    s_cbranch_vccz .LBB0_20
+; CHECK-NEXT:  .LBB0_19: ; %while.cond
+; CHECK-NEXT:    ; =>This Loop Header: Depth=1
+; CHECK-NEXT:    ; Child Loop BB0_11 Depth 2
+; CHECK-NEXT:    ; Child Loop BB0_15 Depth 2
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB0_10
+; CHECK-NEXT:    s_branch .LBB0_12
 ; CHECK-NEXT:  .LBB0_20: ; %DummyReturnBlock
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index f7479c847559254..9e383ec0b1bfcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -46,46 +46,45 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; GCN:       ; %bb.0: ; %main_body
 ; GCN-NEXT:    s_mov_b64 s[0:1], 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_branch .LBB0_2
-; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
-; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
-; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; GCN-NEXT:    s_cbranch_execz .LBB0_6
-; GCN-NEXT:  .LBB0_2: ; %LOOP.outer
-; GCN-NEXT:    ; =>This Loop Header: Depth=1
-; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; GCN-NEXT:    ; implicit-def: $sgpr2_sgpr3
-; GCN-NEXT:    s_mov_b64 s[4:5], 0
-; GCN-NEXT:    s_branch .LBB0_4
-; GCN-NEXT:  .LBB0_3: ; %Flow
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_branch .LBB0_5
+; GCN-NEXT:  .LBB0_1: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
 ; GCN-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB0_1
-; GCN-NEXT:  .LBB0_4: ; %LOOP
-; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
+; GCN-NEXT:    s_cbranch_execz .LBB0_4
+; GCN-NEXT:  .LBB0_2: ; %LOOP
+; GCN-NEXT:    ; Parent Loop BB0_5 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, v0, v4
 ; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB0_3
-; GCN-NEXT:  ; %bb.5: ; %ENDIF
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_cbranch_execz .LBB0_1
+; GCN-NEXT:  ; %bb.3: ; %ENDIF
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
 ; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v0
 ; GCN-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT:    s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
-; GCN-NEXT:    s_branch .LBB0_3
-; GCN-NEXT:  .LBB0_6: ; %IF
+; GCN-NEXT:    s_branch .LBB0_1
+; GCN-NEXT:  .LBB0_4: ; %loop.exit.guard
+; GCN-NEXT:    ; in Loop: Header=BB0_5 Depth=1
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GCN-NEXT:  .LBB0_5: ; %LOOP.outer
+; GCN-NEXT:    ; =>This Loop Header: Depth=1
+; GCN-NEXT:    ; Child Loop BB0_2 Depth 2
+; GCN-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; GCN-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
+; GCN-NEXT:    s_branch .LBB0_2
+; GCN-NEXT:  ; %bb.6: ; %IF
 ; GCN-NEXT:    s_endpgm
 main_body:
   br label %LOOP.outer
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index 2a65426d876dcfb..5e8f17265a3d167 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -11,45 +11,31 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_branch .LBB0_2
-; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
-; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    s_and_b64 vcc, exec, s[14:15]
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_9
-; GCN-NEXT:  .LBB0_2: ; %bb1
-; GCN-NEXT:    ; =>This Loop Header: Depth=1
-; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
-; GCN-NEXT:    s_mov_b32 s11, s7
-; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
-; GCN-NEXT:    s_mov_b32 s12, s6
-; GCN-NEXT:    s_branch .LBB0_4
-; GCN-NEXT:  .LBB0_3: ; %Flow1
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_branch .LBB0_8
+; GCN-NEXT:  .LBB0_1: ; %Flow1
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
-; GCN-NEXT:    s_cbranch_vccz .LBB0_1
-; GCN-NEXT:  .LBB0_4: ; %bb2
-; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
+; GCN-NEXT:    s_cbranch_vccz .LBB0_7
+; GCN-NEXT:  .LBB0_2: ; %bb2
+; GCN-NEXT:    ; Parent Loop BB0_8 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GCN-NEXT:    s_lshl_b32 s12, s12, 5
-; GCN-NEXT:    s_cbranch_vccz .LBB0_6
-; GCN-NEXT:  ; %bb.5: ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_cbranch_vccz .LBB0_4
+; GCN-NEXT:  ; %bb.3: ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_mov_b64 s[14:15], s[2:3]
-; GCN-NEXT:    s_branch .LBB0_7
-; GCN-NEXT:  .LBB0_6: ; %bb3
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_branch .LBB0_5
+; GCN-NEXT:  .LBB0_4: ; %bb3
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_add_i32 s12, s12, 1
 ; GCN-NEXT:    s_mov_b64 s[14:15], -1
-; GCN-NEXT:  .LBB0_7: ; %Flow
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:  .LBB0_5: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
 ; GCN-NEXT:    s_mov_b64 s[16:17], -1
-; GCN-NEXT:    s_cbranch_vccnz .LBB0_3
-; GCN-NEXT:  ; %bb.8: ; %bb4
-; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_1
+; GCN-NEXT:  ; %bb.6: ; %bb4
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=2
 ; GCN-NEXT:    s_ashr_i32 s13, s12, 31
 ; GCN-NEXT:    s_lshl_b64 s[16:17], s[12:13], 2
 ; GCN-NEXT:    s_mov_b64 s[14:15], 0
@@ -58,7 +44,21 @@ define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
 ; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
 ; GCN-NEXT:    s_cmp_eq_u32 s12, 32
 ; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT:    s_branch .LBB0_3
+; GCN-NEXT:    s_branch .LBB0_1
+; GCN-NEXT:  .LBB0_7: ; %loop.exit.guard
+; GCN-NEXT:    ; in Loop: Header=BB0_8 Depth=1
+; GCN-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_9
+; GCN-NEXT:  .LBB0_8: ; %bb1
+; GCN-NEXT:    ; =>This Loop Header: Depth=1
+; GCN-NEXT:    ; Child Loop BB0_2 Depth 2
+; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GCN-NEXT:    s_mov_b32 s12, s6
+; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_9: ; %DummyReturnBlock
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index a462c19ce645d4a..b23fa568c6466b4 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -451,14 +451,22 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_movk_i32 s0, 0x7f
-; GFX8-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX8-NEXT:    s_branch .LBB1_3
+; GFX8-NEXT:  .LBB1_1: ; %while.cond.loopexit
+; GFX8-NEXT:    ; in Loop: Header=BB1_3 Depth=1
+; GFX8-NEXT:    s_add_i32 s1, s0, -1
+; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX8-NEXT:  ; %bb.2: ; in Loop: Header=BB1_3 Depth=1
+; GFX8-NEXT:    s_mov_b32 s0, s1
+; GFX8-NEXT:  .LBB1_3: ; %for.cond.preheader
 ; GFX8-NEXT:    ; =>This Loop Header: Depth=1
-; GFX8-NEXT:    ; Child Loop BB1_2 Depth 2
+; GFX8-NEXT:    ; Child Loop BB1_4 Depth 2
 ; GFX8-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX8-NEXT:    s_mov_b32 s1, 0
-; GFX8-NEXT:  .LBB1_2: ; %for.body
-; GFX8-NEXT:    ; Parent Loop BB1_1 Depth=1
+; GFX8-NEXT:  .LBB1_4: ; %for.body
+; GFX8-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GFX8-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xffffb000, v5
 ; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, -1, v6, vcc
@@ -528,14 +536,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v27, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v28, v4, vcc
-; GFX8-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX8-NEXT:  ; %bb.3: ; %while.cond.loopexit
-; GFX8-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_add_i32 s1, s0, -1
-; GFX8-NEXT:    s_cmp_eq_u32 s0, 0
-; GFX8-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX8-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX8-NEXT:    s_mov_b32 s0, s1
+; GFX8-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX8-NEXT:    s_branch .LBB1_1
 ; GFX8-NEXT:  .LBB1_5: ; %while.end
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s35
@@ -582,14 +583,22 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    s_movk_i32 s0, 0xd000
 ; GFX900-NEXT:    s_movk_i32 s1, 0xe000
 ; GFX900-NEXT:    s_movk_i32 s3, 0xf000
-; GFX900-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX900-NEXT:    s_branch .LBB1_3
+; GFX900-NEXT:  .LBB1_1: ; %while.cond.loopexit
+; GFX900-NEXT:    ; in Loop: Header=BB1_3 Depth=1
+; GFX900-NEXT:    s_add_i32 s4, s2, -1
+; GFX900-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX900-NEXT:  ; %bb.2: ; in Loop: Header=BB1_3 Depth=1
+; GFX900-NEXT:    s_mov_b32 s2, s4
+; GFX900-NEXT:  .LBB1_3: ; %for.cond.preheader
 ; GFX900-NEXT:    ; =>This Loop Header: Depth=1
-; GFX900-NEXT:    ; Child Loop BB1_2 Depth 2
+; GFX900-NEXT:    ; Child Loop BB1_4 Depth 2
 ; GFX900-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX900-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX900-NEXT:    s_mov_b32 s4, 0
-; GFX900-NEXT:  .LBB1_2: ; %for.body
-; GFX900-NEXT:    ; Parent Loop BB1_1 Depth=1
+; GFX900-NEXT:  .LBB1_4: ; %for.body
+; GFX900-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GFX900-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX900-NEXT:    v_add_co_u32_e32 v7, vcc, 0xffffb000, v5
 ; GFX900-NEXT:    v_addc_co_u32_e32 v8, vcc, -1, v6, vcc
@@ -646,14 +655,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_add_co_u32_e32 v3, vcc, v29, v3
 ; GFX900-NEXT:    v_addc_co_u32_e32 v4, vcc, v30, v4, vcc
-; GFX900-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX900-NEXT:  ; %bb.3: ; %while.cond.loopexit
-; GFX900-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_add_i32 s4, s2, -1
-; GFX900-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX900-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX900-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX900-NEXT:    s_mov_b32 s2, s4
+; GFX900-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX900-NEXT:    s_branch .LBB1_1
 ; GFX900-NEXT:  .LBB1_5: ; %while.end
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s35
@@ -695,14 +697,22 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, 0x5000, v1
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX10-NEXT:    s_branch .LBB1_3
+; GFX10-NEXT:  .LBB1_1: ; %while.cond.loopexit
+; GFX10-NEXT:    ; in Loop: Header=BB1_3 Depth=1
+; GFX10-NEXT:    s_add_i32 s0, s1, -1
+; GFX10-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX10-NEXT:  ; %bb.2: ; in Loop: Header=BB1_3 Depth=1
+; GFX10-NEXT:    s_mov_b32 s1, s0
+; GFX10-NEXT:  .LBB1_3: ; %for.cond.preheader
 ; GFX10-NEXT:    ; =>This Loop Header: Depth=1
-; GFX10-NEXT:    ; Child Loop BB1_2 Depth 2
+; GFX10-NEXT:    ; Child Loop BB1_4 Depth 2
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX10-NEXT:    s_mov_b32 s2, 0
-; GFX10-NEXT:  .LBB1_2: ; %for.body
-; GFX10-NEXT:    ; Parent Loop BB1_1 Depth=1
+; GFX10-NEXT:  .LBB1_4: ; %for.body
+; GFX10-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GFX10-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v5, 0xffffb800
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, -1, v6, vcc_lo
@@ -761,14 +771,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v31, v3
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v32, v4, vcc_lo
-; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX10-NEXT:  ; %bb.3: ; %while.cond.loopexit
-; GFX10-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX10-NEXT:    s_add_i32 s0, s1, -1
-; GFX10-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX10-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX10-NEXT:    s_mov_b32 s1, s0
+; GFX10-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX10-NEXT:    s_branch .LBB1_1
 ; GFX10-NEXT:  .LBB1_5: ; %while.end
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, s34, v0
@@ -813,13 +816,21 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    s_movk_i32 s0, 0xd000
 ; GFX90A-NEXT:    s_movk_i32 s1, 0xe000
 ; GFX90A-NEXT:    s_movk_i32 s3, 0xf000
-; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX90A-NEXT:    s_branch .LBB1_3
+; GFX90A-NEXT:  .LBB1_1: ; %while.cond.loopexit
+; GFX90A-NEXT:    ; in Loop: Header=BB1_3 Depth=1
+; GFX90A-NEXT:    s_add_i32 s4, s2, -1
+; GFX90A-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX90A-NEXT:  ; %bb.2: ; in Loop: Header=BB1_3 Depth=1
+; GFX90A-NEXT:    s_mov_b32 s2, s4
+; GFX90A-NEXT:  .LBB1_3: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
-; GFX90A-NEXT:    ; Child Loop BB1_2 Depth 2
+; GFX90A-NEXT:    ; Child Loop BB1_4 Depth 2
 ; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1]
 ; GFX90A-NEXT:    s_mov_b32 s4, 0
-; GFX90A-NEXT:  .LBB1_2: ; %for.body
-; GFX90A-NEXT:    ; Parent Loop BB1_1 Depth=1
+; GFX90A-NEXT:  .LBB1_4: ; %for.body
+; GFX90A-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v12, vcc, 0xffffb000, v6
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v7, vcc
@@ -876,14 +887,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v30, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v31, v5, vcc
-; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX90A-NEXT:  ; %bb.3: ; %while.cond.loopexit
-; GFX90A-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX90A-NEXT:    s_add_i32 s4, s2, -1
-; GFX90A-NEXT:    s_cmp_eq_u32 s2, 0
-; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX90A-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX90A-NEXT:    s_mov_b32 s2, s4
+; GFX90A-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX90A-NEXT:    s_branch .LBB1_1
 ; GFX90A-NEXT:  .LBB1_5: ; %while.end
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s35
@@ -918,14 +922,22 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, 0x5000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
-; GFX11-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX11-NEXT:    s_branch .LBB1_3
+; GFX11-NEXT:  .LBB1_1: ; %while.cond.loopexit
+; GFX11-NEXT:    ; in Loop: Header=BB1_3 Depth=1
+; GFX11-NEXT:    s_add_i32 s0, s1, -1
+; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
+; GFX11-NEXT:  ; %bb.2: ; in Loop: Header=BB1_3 Depth=1
+; GFX11-NEXT:    s_mov_b32 s1, s0
+; GFX11-NEXT:  .LBB1_3: ; %for.cond.preheader
 ; GFX11-NEXT:    ; =>This Loop Header: Depth=1
-; GFX11-NEXT:    ; Child Loop BB1_2 Depth 2
+; GFX11-NEXT:    ; Child Loop BB1_4 Depth 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v5, v1
 ; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:  .LBB1_2: ; %for.body
-; GFX11-NEXT:    ; Parent Loop BB1_1 Depth=1
+; GFX11-NEXT:  .LBB1_4: ; %for.body
+; GFX11-NEXT:    ; Parent Loop BB1_3 Depth=1
 ; GFX11-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v5, 0xffffc000
@@ -1000,14 +1012,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1)  %buffer) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v27, v3
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v28, v4, vcc_lo
-; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX11-NEXT:  ; %bb.3: ; %while.cond.loopexit
-; GFX11-NEXT:    ; in Loop: Header=BB1_1 Depth=1
-; GFX11-NEXT:    s_add_i32 s0, s1, -1
-; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX11-NEXT:    s_cbranch_scc1 .LBB1_5
-; GFX11-NEXT:  ; %bb.4: ; in Loop: Header=BB1_1 Depth=1
-; GFX11-NEXT:    s_mov_b32 s1, s0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB1_4
 ; GFX11-NEXT:    s_branch .LBB1_1
 ; GFX11-NEXT:  .LBB1_5: ; %while.end
 ; GFX11-NEXT:    v_add_co_u32 v0, s0, s34, v0
diff --git a/llvm/test/CodeGen/ARM/code-placement.ll b/llvm/test/CodeGen/ARM/code-placement.ll
index 01d72f134aacbd9..c93680b339f3d7b 100644
--- a/llvm/test/CodeGen/ARM/code-placement.ll
+++ b/llvm/test/CodeGen/ARM/code-placement.ll
@@ -36,9 +36,8 @@ entry:
   br i1 %0, label %bb5, label %bb.nph15
 
 bb1:                                              ; preds = %bb2.preheader, %bb1
-; CHECK: LBB1_[[BB3:.]]: @ %bb3
-; CHECK: LBB1_[[PREHDR:.]]: @ %bb2.preheader
-; CHECK: bmi LBB1_[[BB3]]
+; CHECK: LBB1_[[BB1:.]]: @ %bb1
+; CHECK: bne LBB1_[[BB1]]
   %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %bb2.preheader ] ; <i32> [#uses=2]
   %sum.08 = phi i32 [ %2, %bb1 ], [ %sum.110, %bb2.preheader ] ; <i32> [#uses=1]
   %tmp17 = sub i32 %i.07, %indvar                 ; <i32> [#uses=1]
@@ -50,8 +49,10 @@ bb1:                                              ; preds = %bb2.preheader, %bb1
   br i1 %exitcond, label %bb3, label %bb1
 
 bb3:                                              ; preds = %bb1, %bb2.preheader
-; CHECK: LBB1_[[BB1:.]]: @ %bb1
-; CHECK: bne LBB1_[[BB1]]
+; CHECK: LBB1_[[BB3:.]]: @ %bb3
+; CHECK: LBB1_[[PREHDR:.]]: @ %bb2.preheader
+; CHECK: bpl
+; CHECK-NEXT: b LBB1_[[BB3]]
   %sum.0.lcssa = phi i32 [ %sum.110, %bb2.preheader ], [ %2, %bb1 ] ; <i32> [#uses=2]
   %3 = add i32 %pass.011, 1                       ; <i32> [#uses=2]
   %exitcond18 = icmp eq i32 %3, %passes           ; <i1> [#uses=1]
diff --git a/llvm/test/CodeGen/ARM/loop-indexing.ll b/llvm/test/CodeGen/ARM/loop-indexing.ll
index 110342c7f3ba73e..5b353c09ccab34a 100644
--- a/llvm/test/CodeGen/ARM/loop-indexing.ll
+++ b/llvm/test/CodeGen/ARM/loop-indexing.ll
@@ -1,17 +1,17 @@
 ; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | \
-; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 --check-prefix=CHECK-T2-NOCOMPLEX
 
 ; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 -lsr-preferred-addressing-mode=none %s -o - | \
 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | \
-; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 --check-prefix=CHECK-T2-NOCOMPLEX
 
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=postindexed %s -o - | \
 ; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=preindexed %s -o - | \
-; RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-T2
+; RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-T2 --check-prefix=CHECK-T2-NOCOMPLEX
 
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
@@ -620,10 +620,6 @@ for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.
 }
 
 ; CHECK-LABEL: mul_16x16_2d
-; CHECK: @ %for.body4.us
-
-; CHECK-DEFAULT: ldr{{.*}}, #16]!
-; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 
 ; DISABLED-NOT: ldr{{.*}}]!
 ; DISABLED-NOT: str{{.*}}]!
@@ -632,6 +628,10 @@ for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.
 ; CHECK-T2: ldrsh{{.*}}, #2]!
 ; CHECK-T2: ldr{{.*}}, #4]!
 
+; CHECK: @ %for.body4.us
+; CHECK-DEFAULT: ldr{{.*}}, #16]!
+; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
+
 define void @mul_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readonly %C, i32 %N, i32 %M) {
 entry:
   %cmp24 = icmp eq i32 %N, 0
@@ -849,9 +849,12 @@ for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.
 }
 
 ; CHECK-LABEL: mac_16x16_2d
-; CHECK: @ %for.body4.us
+
+; CHECK-T2-NOCOMPLEX: @ %for.body4.us.epil
+; CHECK-T2-NOCOMPLEX: ldrsh{{.*}}, #2]!
 
 ; TODO: pre-indexed loads for both input arrays.
+; CHECK: @ %for.body4.us
 ; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
 ; CHECK-DEFAULT-NOT: ldr{{.*}}]!
 
@@ -860,8 +863,8 @@ for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.
 
 ; DISABLED-NOT: ldr{{.*}}]!
 
-; CHECK-T2: @ %for.body4.us.epil
-; CHECK-T2: ldrsh{{.*}}, #2]!
+; CHECK-COMPLEX: @ %for.body4.us.epil
+; CHECK-COMPLEX: ldrsh{{.*}}, #2]!
 
 define void @mac_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N, i32 %M) {
 entry:
diff --git a/llvm/test/CodeGen/AVR/rot.ll b/llvm/test/CodeGen/AVR/rot.ll
index 1a6b917af95b7aa..fd99d02bd23787d 100644
--- a/llvm/test/CodeGen/AVR/rot.ll
+++ b/llvm/test/CodeGen/AVR/rot.ll
@@ -6,27 +6,27 @@ define i8 @rol8(i8 %val, i8 %amt) {
 ; CHECK-LABEL: rol8:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    andi r22, 7
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB0_2
-; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    rjmp .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    lsl r24
 ; CHECK-NEXT:    adc r24, r1
+; CHECK-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB0_1
-; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    ret
 ;
 ; TINY-LABEL: rol8:
 ; TINY:       ; %bb.0:
 ; TINY-NEXT:    andi r22, 7
-; TINY-NEXT:    dec r22
-; TINY-NEXT:    brmi .LBB0_2
-; TINY-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; TINY-NEXT:    rjmp .LBB0_2
+; TINY-NEXT:  .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
 ; TINY-NEXT:    lsl r24
 ; TINY-NEXT:    adc r24, r17
+; TINY-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
 ; TINY-NEXT:    dec r22
 ; TINY-NEXT:    brpl .LBB0_1
-; TINY-NEXT:  .LBB0_2:
+; TINY-NEXT:  ; %bb.3:
 ; TINY-NEXT:    ret
   %mod = urem i8 %amt, 8
   %inv = sub i8 8, %mod
@@ -41,29 +41,29 @@ define i8 @ror8(i8 %val, i8 %amt) {
 ; CHECK-LABEL: ror8:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    andi r22, 7
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB1_2
-; CHECK-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    rjmp .LBB1_2
+; CHECK-NEXT:  .LBB1_1: ; in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    bst r24, 0
 ; CHECK-NEXT:    ror r24
 ; CHECK-NEXT:    bld r24, 7
+; CHECK-NEXT:  .LBB1_2: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB1_1
-; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    ret
 ;
 ; TINY-LABEL: ror8:
 ; TINY:       ; %bb.0:
 ; TINY-NEXT:    andi r22, 7
-; TINY-NEXT:    dec r22
-; TINY-NEXT:    brmi .LBB1_2
-; TINY-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; TINY-NEXT:    rjmp .LBB1_2
+; TINY-NEXT:  .LBB1_1: ; in Loop: Header=BB1_2 Depth=1
 ; TINY-NEXT:    bst r24, 0
 ; TINY-NEXT:    ror r24
 ; TINY-NEXT:    bld r24, 7
+; TINY-NEXT:  .LBB1_2: ; =>This Inner Loop Header: Depth=1
 ; TINY-NEXT:    dec r22
 ; TINY-NEXT:    brpl .LBB1_1
-; TINY-NEXT:  .LBB1_2:
+; TINY-NEXT:  ; %bb.3:
 ; TINY-NEXT:    ret
   %mod = urem i8 %amt, 8
   %inv = sub i8 8, %mod
diff --git a/llvm/test/CodeGen/AVR/rotate.ll b/llvm/test/CodeGen/AVR/rotate.ll
index 79ff7928b3a3984..38fbd3f5f0ee90a 100644
--- a/llvm/test/CodeGen/AVR/rotate.ll
+++ b/llvm/test/CodeGen/AVR/rotate.ll
@@ -53,15 +53,16 @@ define i8 @rotl8_dyn(i8 %x, i8 %y) {
 ; CHECK-LABEL: rotl8_dyn:
 ; CHECK:       ; %bb.0: ; %start
 ; CHECK-NEXT:    andi r22, 7
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB4_2
+; CHECK-NEXT:    rjmp .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: ; %start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ; in Loop: Header=BB4_2 Depth=1
 ; CHECK-NEXT:    lsl r24
 ; CHECK-NEXT:    adc r24, r1
+; CHECK-NEXT:  .LBB4_2: ; %start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB4_1
-; CHECK-NEXT:  .LBB4_2: ; %start
+; CHECK-NEXT:  ; %bb.3: ; %start
 ; CHECK-NEXT:    ret
 start:
   %0 = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 %y)
@@ -120,16 +121,17 @@ define i8 @rotr8_dyn(i8 %x, i8 %y) {
 ; CHECK-LABEL: rotr8_dyn:
 ; CHECK:       ; %bb.0: ; %start
 ; CHECK-NEXT:    andi r22, 7
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB9_2
+; CHECK-NEXT:    rjmp .LBB9_2
 ; CHECK-NEXT:  .LBB9_1: ; %start
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ; in Loop: Header=BB9_2 Depth=1
 ; CHECK-NEXT:    bst r24, 0
 ; CHECK-NEXT:    ror r24
 ; CHECK-NEXT:    bld r24, 7
+; CHECK-NEXT:  .LBB9_2: ; %start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB9_1
-; CHECK-NEXT:  .LBB9_2: ; %start
+; CHECK-NEXT:  ; %bb.3: ; %start
 ; CHECK-NEXT:    ret
 start:
   %0 = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 %y)
diff --git a/llvm/test/CodeGen/AVR/shift.ll b/llvm/test/CodeGen/AVR/shift.ll
index c0abc77c9b14ae1..e2de4f5d40be269 100644
--- a/llvm/test/CodeGen/AVR/shift.ll
+++ b/llvm/test/CodeGen/AVR/shift.ll
@@ -5,13 +5,13 @@
 define i8 @shift_i8_i8_speed(i8 %a, i8 %b) {
 ; CHECK-LABEL: shift_i8_i8_speed:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB0_2
-; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    rjmp .LBB0_2
+; CHECK-NEXT:  .LBB0_1: ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    lsl r24
+; CHECK-NEXT:  .LBB0_2: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB0_1
-; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    ret
   %result = shl i8 %a, %b
   ret i8 %result
@@ -36,14 +36,14 @@ define i8 @shift_i8_i8_size(i8 %a, i8 %b) optsize {
 define i16 @shift_i16_i16(i16 %a, i16 %b) {
 ; CHECK-LABEL: shift_i16_i16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    dec r22
-; CHECK-NEXT:    brmi .LBB2_2
-; CHECK-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    rjmp .LBB2_2
+; CHECK-NEXT:  .LBB2_1: ; in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    lsl r24
 ; CHECK-NEXT:    rol r25
+; CHECK-NEXT:  .LBB2_2: ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    dec r22
 ; CHECK-NEXT:    brpl .LBB2_1
-; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:  ; %bb.3:
 ; CHECK-NEXT:    ret
   %result = shl i16 %a, %b
   ret i16 %result
diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter.ll b/llvm/test/CodeGen/Generic/machine-function-splitter.ll
index 364eadd64c7556d..cdbf0690356b628 100644
--- a/llvm/test/CodeGen/Generic/machine-function-splitter.ll
+++ b/llvm/test/CodeGen/Generic/machine-function-splitter.ll
@@ -523,7 +523,6 @@ define i32 @foo18(i32 %in) !prof !14 !section_prefix !15 {
 ; MFS-DEFAULTS-LABEL:        foo18
 ; MFS-DEFAULTS:              .section        .text.split.foo18
 ; MFS-DEFAULTS-NEXT:         foo18.cold:
-; MFS-DEFAULTS-SAME:           %common.ret
 ; MFS-DEFAULTS-X86-DAG:        jmp     qux
 ; MFS-DEFAULTS-X86-DAG:        jmp     bam
 ; MFS-DEFAULTS-AARCH64-NOT:    b       bar
diff --git a/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll
index 0e6aadf092db9aa..18144fdb1d9caaf 100644
--- a/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll
+++ b/llvm/test/CodeGen/Hexagon/lsr-postinc-nested-loop.ll
@@ -2,10 +2,10 @@
 ; Test to ensure LSR does not optimize out addrec of the outerloop.
 ; This will help to generate post-increment instructions, otherwise
 ; it end up an as extra reg+reg add inside the loop.
-; CHECK:  loop0(.LBB0_[[LOOP:.]],
-; CHECK: .LBB0_[[LOOP]]:
+; CHECK: .LBB0_[[LOOP:.]]:{{.*}}// %for.body4.us
 ; CHECK: memuh{{.*}}++
 ; CHECK: endloop
+; CHECK:  loop0(.LBB0_[[LOOP]],
 
 
 define dso_local signext i16 @foo(ptr nocapture readonly %filt, ptr nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr {
diff --git a/llvm/test/CodeGen/Hexagon/prof-early-if.ll b/llvm/test/CodeGen/Hexagon/prof-early-if.ll
index 5f0c9c8f10a5891..db505ed9e1a2c30 100644
--- a/llvm/test/CodeGen/Hexagon/prof-early-if.ll
+++ b/llvm/test/CodeGen/Hexagon/prof-early-if.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
 ; Rely on the comments generated by llc. Check that "if.then" was not predicated.
-; CHECK: b5
 ; CHECK: b2
+; CHECK: b5
 ; CHECK-NOT: if{{.*}}memd
 
 %s.0 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %s.1], [5 x i32] }
diff --git a/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll
index c356353a0264ddc..743bfbfe7e621bf 100644
--- a/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-multi-loops.ll
@@ -3,18 +3,18 @@
 ; Make sure we attempt to pipeline all inner most loops.
 
 ; Check if the first loop is pipelined.
-; CHECK: loop0(.LBB0_[[LOOP:.]],
-; CHECK: .LBB0_[[LOOP]]:
+; CHECK: .LBB0_[[LOOP:.]]:{{.*}}// Block address taken
 ; CHECK: add(r{{[0-9]+}},r{{[0-9]+}})
 ; CHECK-NEXT: memw(r{{[0-9]+}}++#4)
 ; CHECK-NEXT: endloop0
+; CHECK: loop0(.LBB0_[[LOOP:.]],
 
 ; Check if the second loop is pipelined.
-; CHECK: loop0(.LBB0_[[LOOP:.]],
-; CHECK: .LBB0_[[LOOP]]:
+; CHECK: .LBB0_[[LOOP:.]]:{{.*}}// Block address taken
 ; CHECK: add(r{{[0-9]+}},r{{[0-9]+}})
 ; CHECK-NEXT: memw(r{{[0-9]+}}++#4)
 ; CHECK-NEXT: endloop0
+; CHECK: loop0(.LBB0_[[LOOP:.]],
 
 define i32 @test(ptr %a, i32 %n, i32 %l) {
 entry:
diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
index b079169974d8b85..ff863a19b4918fb 100644
--- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
+++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
@@ -40,7 +40,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -16
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS32R2-NEXT:    sltiu $1, $4, 7
-; MIPS32R2-NEXT:    beqz $1, $BB0_3
+; MIPS32R2-NEXT:    beqz $1, $BB0_9
 ; MIPS32R2-NEXT:    sw $4, 4($sp)
 ; MIPS32R2-NEXT:  $BB0_1: # %entry
 ; MIPS32R2-NEXT:    sll $1, $4, 2
@@ -54,39 +54,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_3: # %sw.epilog
-; MIPS32R2-NEXT:    lui $1, %hi($.str.7)
-; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
-; MIPS32R2-NEXT:    j $BB0_10
-; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_4: # %sw.bb1
+; MIPS32R2-NEXT:  $BB0_3: # %sw.bb1
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.1)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.1)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_5: # %sw.bb2
+; MIPS32R2-NEXT:  $BB0_4: # %sw.bb2
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.2)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.2)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_6: # %sw.bb3
+; MIPS32R2-NEXT:  $BB0_5: # %sw.bb3
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.3)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.3)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_7: # %sw.bb4
+; MIPS32R2-NEXT:  $BB0_6: # %sw.bb4
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.4)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.4)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_8: # %sw.bb5
+; MIPS32R2-NEXT:  $BB0_7: # %sw.bb5
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.5)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.5)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_9: # %sw.bb6
+; MIPS32R2-NEXT:  $BB0_8: # %sw.bb6
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.6)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.6)
+; MIPS32R2-NEXT:    j $BB0_10
+; MIPS32R2-NEXT:    sw $1, 8($sp)
+; MIPS32R2-NEXT:  $BB0_9: # %sw.epilog
+; MIPS32R2-NEXT:    lui $1, %hi($.str.7)
+; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
 ; MIPS32R2-NEXT:  $BB0_10: # %return
 ; MIPS32R2-NEXT:    lw $2, 8($sp)
@@ -98,7 +98,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R6-NEXT:    addiu $sp, $sp, -16
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS32R6-NEXT:    sltiu $1, $4, 7
-; MIPS32R6-NEXT:    beqz $1, $BB0_3
+; MIPS32R6-NEXT:    beqz $1, $BB0_9
 ; MIPS32R6-NEXT:    sw $4, 4($sp)
 ; MIPS32R6-NEXT:  $BB0_1: # %entry
 ; MIPS32R6-NEXT:    sll $1, $4, 2
@@ -112,39 +112,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_3: # %sw.epilog
-; MIPS32R6-NEXT:    lui $1, %hi($.str.7)
-; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
-; MIPS32R6-NEXT:    j $BB0_10
-; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_4: # %sw.bb1
+; MIPS32R6-NEXT:  $BB0_3: # %sw.bb1
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.1)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.1)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_5: # %sw.bb2
+; MIPS32R6-NEXT:  $BB0_4: # %sw.bb2
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.2)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.2)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_6: # %sw.bb3
+; MIPS32R6-NEXT:  $BB0_5: # %sw.bb3
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.3)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.3)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_7: # %sw.bb4
+; MIPS32R6-NEXT:  $BB0_6: # %sw.bb4
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.4)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.4)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_8: # %sw.bb5
+; MIPS32R6-NEXT:  $BB0_7: # %sw.bb5
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.5)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.5)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_9: # %sw.bb6
+; MIPS32R6-NEXT:  $BB0_8: # %sw.bb6
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.6)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.6)
+; MIPS32R6-NEXT:    j $BB0_10
+; MIPS32R6-NEXT:    sw $1, 8($sp)
+; MIPS32R6-NEXT:  $BB0_9: # %sw.epilog
+; MIPS32R6-NEXT:    lui $1, %hi($.str.7)
+; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
 ; MIPS32R6-NEXT:  $BB0_10: # %return
 ; MIPS32R6-NEXT:    lw $2, 8($sp)
@@ -157,7 +157,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS64R2-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R2-NEXT:    sltiu $1, $2, 7
-; MIPS64R2-NEXT:    beqz $1, .LBB0_3
+; MIPS64R2-NEXT:    beqz $1, .LBB0_9
 ; MIPS64R2-NEXT:    sw $4, 4($sp)
 ; MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; MIPS64R2-NEXT:    dsll $1, $2, 3
@@ -179,16 +179,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_3: # %sw.epilog
-; MIPS64R2-NEXT:    lui $1, %highest(.L.str.7)
-; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.7)
-; MIPS64R2-NEXT:    dsll $1, $1, 16
-; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.7)
-; MIPS64R2-NEXT:    dsll $1, $1, 16
-; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.7)
-; MIPS64R2-NEXT:    j .LBB0_10
-; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_4: # %sw.bb1
+; MIPS64R2-NEXT:  .LBB0_3: # %sw.bb1
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.1)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.1)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -197,7 +188,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.1)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_5: # %sw.bb2
+; MIPS64R2-NEXT:  .LBB0_4: # %sw.bb2
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.2)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.2)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -206,7 +197,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.2)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_6: # %sw.bb3
+; MIPS64R2-NEXT:  .LBB0_5: # %sw.bb3
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.3)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.3)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -215,7 +206,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.3)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_7: # %sw.bb4
+; MIPS64R2-NEXT:  .LBB0_6: # %sw.bb4
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.4)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.4)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -224,7 +215,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.4)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_8: # %sw.bb5
+; MIPS64R2-NEXT:  .LBB0_7: # %sw.bb5
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.5)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.5)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -233,13 +224,22 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.5)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_9: # %sw.bb6
+; MIPS64R2-NEXT:  .LBB0_8: # %sw.bb6
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.6)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.6)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
 ; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.6)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.6)
+; MIPS64R2-NEXT:    j .LBB0_10
+; MIPS64R2-NEXT:    sd $1, 8($sp)
+; MIPS64R2-NEXT:  .LBB0_9: # %sw.epilog
+; MIPS64R2-NEXT:    lui $1, %highest(.L.str.7)
+; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.7)
+; MIPS64R2-NEXT:    dsll $1, $1, 16
+; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.7)
+; MIPS64R2-NEXT:    dsll $1, $1, 16
+; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.7)
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
 ; MIPS64R2-NEXT:  .LBB0_10: # %return
 ; MIPS64R2-NEXT:    ld $2, 8($sp)
@@ -252,7 +252,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS64R6-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R6-NEXT:    sltiu $1, $2, 7
-; MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; MIPS64R6-NEXT:    beqz $1, .LBB0_9
 ; MIPS64R6-NEXT:    sw $4, 4($sp)
 ; MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; MIPS64R6-NEXT:    dsll $1, $2, 3
@@ -274,16 +274,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_3: # %sw.epilog
-; MIPS64R6-NEXT:    lui $1, %highest(.L.str.7)
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.7)
-; MIPS64R6-NEXT:    dsll $1, $1, 16
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.7)
-; MIPS64R6-NEXT:    dsll $1, $1, 16
-; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.7)
-; MIPS64R6-NEXT:    j .LBB0_10
-; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_4: # %sw.bb1
+; MIPS64R6-NEXT:  .LBB0_3: # %sw.bb1
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.1)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.1)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -292,7 +283,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.1)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_5: # %sw.bb2
+; MIPS64R6-NEXT:  .LBB0_4: # %sw.bb2
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.2)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.2)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -301,7 +292,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.2)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_6: # %sw.bb3
+; MIPS64R6-NEXT:  .LBB0_5: # %sw.bb3
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.3)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.3)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -310,7 +301,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.3)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_7: # %sw.bb4
+; MIPS64R6-NEXT:  .LBB0_6: # %sw.bb4
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.4)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.4)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -319,7 +310,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.4)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_8: # %sw.bb5
+; MIPS64R6-NEXT:  .LBB0_7: # %sw.bb5
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.5)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.5)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -328,13 +319,22 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.5)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_9: # %sw.bb6
+; MIPS64R6-NEXT:  .LBB0_8: # %sw.bb6
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.6)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.6)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
 ; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.6)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.6)
+; MIPS64R6-NEXT:    j .LBB0_10
+; MIPS64R6-NEXT:    sd $1, 8($sp)
+; MIPS64R6-NEXT:  .LBB0_9: # %sw.epilog
+; MIPS64R6-NEXT:    lui $1, %highest(.L.str.7)
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.7)
+; MIPS64R6-NEXT:    dsll $1, $1, 16
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.7)
+; MIPS64R6-NEXT:    dsll $1, $1, 16
+; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.7)
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
 ; MIPS64R6-NEXT:  .LBB0_10: # %return
 ; MIPS64R6-NEXT:    ld $2, 8($sp)
@@ -349,7 +349,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R2-NEXT:    .cfi_def_cfa_offset 16
 ; PIC-MIPS32R2-NEXT:    addu $2, $2, $25
 ; PIC-MIPS32R2-NEXT:    sltiu $1, $4, 7
-; PIC-MIPS32R2-NEXT:    beqz $1, $BB0_3
+; PIC-MIPS32R2-NEXT:    beqz $1, $BB0_9
 ; PIC-MIPS32R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS32R2-NEXT:  $BB0_1: # %entry
 ; PIC-MIPS32R2-NEXT:    sll $1, $4, 2
@@ -364,39 +364,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_3: # %sw.epilog
-; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.7)($2)
-; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
-; PIC-MIPS32R2-NEXT:    b $BB0_10
-; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_4: # %sw.bb1
+; PIC-MIPS32R2-NEXT:  $BB0_3: # %sw.bb1
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.1)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.1)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_5: # %sw.bb2
+; PIC-MIPS32R2-NEXT:  $BB0_4: # %sw.bb2
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.2)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.2)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_6: # %sw.bb3
+; PIC-MIPS32R2-NEXT:  $BB0_5: # %sw.bb3
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.3)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.3)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_7: # %sw.bb4
+; PIC-MIPS32R2-NEXT:  $BB0_6: # %sw.bb4
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.4)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.4)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_8: # %sw.bb5
+; PIC-MIPS32R2-NEXT:  $BB0_7: # %sw.bb5
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.5)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.5)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_9: # %sw.bb6
+; PIC-MIPS32R2-NEXT:  $BB0_8: # %sw.bb6
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.6)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.6)
+; PIC-MIPS32R2-NEXT:    b $BB0_10
+; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
+; PIC-MIPS32R2-NEXT:  $BB0_9: # %sw.epilog
+; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.7)($2)
+; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
 ; PIC-MIPS32R2-NEXT:  $BB0_10: # %return
 ; PIC-MIPS32R2-NEXT:    lw $2, 8($sp)
@@ -411,7 +411,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R6-NEXT:    .cfi_def_cfa_offset 16
 ; PIC-MIPS32R6-NEXT:    addu $2, $2, $25
 ; PIC-MIPS32R6-NEXT:    sltiu $1, $4, 7
-; PIC-MIPS32R6-NEXT:    beqz $1, $BB0_3
+; PIC-MIPS32R6-NEXT:    beqz $1, $BB0_9
 ; PIC-MIPS32R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS32R6-NEXT:  $BB0_1: # %entry
 ; PIC-MIPS32R6-NEXT:    sll $1, $4, 2
@@ -426,39 +426,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_3: # %sw.epilog
-; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.7)($2)
-; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
-; PIC-MIPS32R6-NEXT:    b $BB0_10
-; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_4: # %sw.bb1
+; PIC-MIPS32R6-NEXT:  $BB0_3: # %sw.bb1
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.1)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.1)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_5: # %sw.bb2
+; PIC-MIPS32R6-NEXT:  $BB0_4: # %sw.bb2
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.2)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.2)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_6: # %sw.bb3
+; PIC-MIPS32R6-NEXT:  $BB0_5: # %sw.bb3
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.3)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.3)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_7: # %sw.bb4
+; PIC-MIPS32R6-NEXT:  $BB0_6: # %sw.bb4
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.4)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.4)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_8: # %sw.bb5
+; PIC-MIPS32R6-NEXT:  $BB0_7: # %sw.bb5
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.5)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.5)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_9: # %sw.bb6
+; PIC-MIPS32R6-NEXT:  $BB0_8: # %sw.bb6
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.6)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.6)
+; PIC-MIPS32R6-NEXT:    b $BB0_10
+; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
+; PIC-MIPS32R6-NEXT:  $BB0_9: # %sw.epilog
+; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.7)($2)
+; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
 ; PIC-MIPS32R6-NEXT:  $BB0_10: # %return
 ; PIC-MIPS32R6-NEXT:    lw $2, 8($sp)
@@ -474,7 +474,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R2-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R2-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_9
 ; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R2-NEXT:    dsll $1, $3, 3
@@ -489,39 +489,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_3: # %sw.epilog
-; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.7)($2)
-; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
-; PIC-MIPS64R2-NEXT:    b .LBB0_10
-; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_4: # %sw.bb1
+; PIC-MIPS64R2-NEXT:  .LBB0_3: # %sw.bb1
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.1)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.1)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_5: # %sw.bb2
+; PIC-MIPS64R2-NEXT:  .LBB0_4: # %sw.bb2
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.2)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.2)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_6: # %sw.bb3
+; PIC-MIPS64R2-NEXT:  .LBB0_5: # %sw.bb3
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.3)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.3)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_7: # %sw.bb4
+; PIC-MIPS64R2-NEXT:  .LBB0_6: # %sw.bb4
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.4)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.4)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_8: # %sw.bb5
+; PIC-MIPS64R2-NEXT:  .LBB0_7: # %sw.bb5
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.5)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.5)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_9: # %sw.bb6
+; PIC-MIPS64R2-NEXT:  .LBB0_8: # %sw.bb6
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.6)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.6)
+; PIC-MIPS64R2-NEXT:    b .LBB0_10
+; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
+; PIC-MIPS64R2-NEXT:  .LBB0_9: # %sw.epilog
+; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.7)($2)
+; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_10: # %return
 ; PIC-MIPS64R2-NEXT:    ld $2, 8($sp)
@@ -537,7 +537,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R6-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R6-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_9
 ; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R6-NEXT:    dsll $1, $3, 3
@@ -552,39 +552,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_3: # %sw.epilog
-; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.7)($2)
-; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
-; PIC-MIPS64R6-NEXT:    b .LBB0_10
-; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_4: # %sw.bb1
+; PIC-MIPS64R6-NEXT:  .LBB0_3: # %sw.bb1
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.1)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.1)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_5: # %sw.bb2
+; PIC-MIPS64R6-NEXT:  .LBB0_4: # %sw.bb2
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.2)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.2)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_6: # %sw.bb3
+; PIC-MIPS64R6-NEXT:  .LBB0_5: # %sw.bb3
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.3)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.3)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_7: # %sw.bb4
+; PIC-MIPS64R6-NEXT:  .LBB0_6: # %sw.bb4
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.4)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.4)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_8: # %sw.bb5
+; PIC-MIPS64R6-NEXT:  .LBB0_7: # %sw.bb5
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.5)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.5)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_9: # %sw.bb6
+; PIC-MIPS64R6-NEXT:  .LBB0_8: # %sw.bb6
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.6)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.6)
+; PIC-MIPS64R6-NEXT:    b .LBB0_10
+; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
+; PIC-MIPS64R6-NEXT:  .LBB0_9: # %sw.epilog
+; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.7)($2)
+; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_10: # %return
 ; PIC-MIPS64R6-NEXT:    ld $2, 8($sp)
diff --git a/llvm/test/CodeGen/Mips/jump-table-mul.ll b/llvm/test/CodeGen/Mips/jump-table-mul.ll
index ef7452cf253fee6..22f41f53d154bf2 100644
--- a/llvm/test/CodeGen/Mips/jump-table-mul.ll
+++ b/llvm/test/CodeGen/Mips/jump-table-mul.ll
@@ -8,15 +8,11 @@ define i64 @test(i64 %arg) {
 ; CHECK-NEXT:    lui $1, %hi(%neg(%gp_rel(test)))
 ; CHECK-NEXT:    daddu $2, $1, $25
 ; CHECK-NEXT:    sltiu $1, $4, 11
-; CHECK-NEXT:    beqz $1, .LBB0_3
+; CHECK-NEXT:    beqz $1, .LBB0_4
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_1: # %entry
 ; CHECK-NEXT:    daddiu $1, $2, %lo(%neg(%gp_rel(test)))
 ; CHECK-NEXT:    dsll $2, $4, 3
-; Previously this dsll was the following sequence:
-;	daddiu	$2, $zero, 8
-;	dmult	$4, $2
-;	mflo	$2
 ; CHECK-NEXT:    ld $3, %got_page(.LJTI0_0)($1)
 ; CHECK-NEXT:    daddu $2, $2, $3
 ; CHECK-NEXT:    ld $2, %got_ofst(.LJTI0_0)($2)
@@ -26,12 +22,16 @@ define i64 @test(i64 %arg) {
 ; CHECK-NEXT:  .LBB0_2: # %sw.bb
 ; CHECK-NEXT:    jr $ra
 ; CHECK-NEXT:    daddiu $2, $zero, 1
-; CHECK-NEXT:  .LBB0_3: # %default
-; CHECK-NEXT:    jr $ra
-; CHECK-NEXT:    daddiu $2, $zero, 1234
-; CHECK-NEXT:  .LBB0_4: # %sw.bb1
+; CHECK-NEXT:  .LBB0_3: # %sw.bb1
 ; CHECK-NEXT:    jr $ra
 ; CHECK-NEXT:    daddiu $2, $zero, 0
+; CHECK-NEXT:  .LBB0_4: # %default
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    daddiu $2, $zero, 1234
+; Previously this dsll was the following sequence:
+;	daddiu	$2, $zero, 8
+;	dmult	$4, $2
+;	mflo	$2
 entry:
   switch i64 %arg, label %default [
     i64 0, label %sw.bb
@@ -54,13 +54,13 @@ sw.bb1:
 ; CHECK-NEXT: 	.p2align	3
 ; CHECK-LABEL: .LJTI0_0:
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
+; CHECK-NEXT: 	.gpdword	.LBB0_4
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
 ; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_3
diff --git a/llvm/test/CodeGen/Mips/nacl-align.ll b/llvm/test/CodeGen/Mips/nacl-align.ll
index bca6c93de2624d3..964ddfc963c4ec3 100644
--- a/llvm/test/CodeGen/Mips/nacl-align.ll
+++ b/llvm/test/CodeGen/Mips/nacl-align.ll
@@ -44,9 +44,6 @@ default:
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
 ; CHECK-NEXT:        addiu   $2, $zero, 111
-; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
-; CHECK-NEXT:        jr      $ra
-; CHECK-NEXT:        addiu   $2, $zero, 555
 ; CHECK-NEXT:        .p2align  4
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
@@ -55,6 +52,13 @@ default:
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
 ; CHECK-NEXT:        addiu   $2, $zero, 333
+; CHECK-NEXT:        .p2align  4
+; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT:        jr      $ra
+; CHECK-NEXT:        addiu   $2, $zero, 444
+; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT:        jr      $ra
+; CHECK-NEXT:        addiu   $2, $zero, 555
 
 }
 
diff --git a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
index 31f077d57a93355..afb79e55f4f90b8 100644
--- a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
+++ b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
@@ -12,7 +12,7 @@ define i32 @test(i32 signext %x, i32 signext %c) {
 ; CHECK-NEXT:    addiu $2, $2, %lo(_gp_disp)
 ; CHECK-NEXT:    addiur2 $5, $5, -1
 ; CHECK-NEXT:    sltiu $1, $5, 4
-; CHECK-NEXT:    beqz $1, $BB0_3
+; CHECK-NEXT:    beqz $1, $BB0_6
 ; CHECK-NEXT:    addu $3, $2, $25
 ; CHECK-NEXT:  $BB0_1: # %entry
 ; CHECK-NEXT:    li16 $2, 0
@@ -26,17 +26,17 @@ define i32 @test(i32 signext %x, i32 signext %c) {
 ; CHECK-NEXT:  $BB0_2: # %sw.bb2
 ; CHECK-NEXT:    addiur2 $2, $4, 1
 ; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_3:
-; CHECK-NEXT:    move $2, $4
-; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_4: # %sw.bb3
+; CHECK-NEXT:  $BB0_3: # %sw.bb3
 ; CHECK-NEXT:    addius5 $4, 2
 ; CHECK-NEXT:    move $2, $4
 ; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_5: # %sw.bb5
+; CHECK-NEXT:  $BB0_4: # %sw.bb5
 ; CHECK-NEXT:    addius5 $4, 3
 ; CHECK-NEXT:    move $2, $4
-; CHECK-NEXT:  $BB0_6: # %for.cond.cleanup
+; CHECK-NEXT:  $BB0_5: # %for.cond.cleanup
+; CHECK-NEXT:    jrc $ra
+; CHECK-NEXT:  $BB0_6:
+; CHECK-NEXT:    move $2, $4
 ; CHECK-NEXT:    jrc $ra
 entry:
   switch i32 %c, label %sw.epilog [
diff --git a/llvm/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll b/llvm/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
index db4e22fd8d17afc..e76e6b1e80fbaa7 100644
--- a/llvm/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
+++ b/llvm/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
@@ -42,9 +42,16 @@ define void @Bork(i64 %range.0.0, i64 %range.0.1, i64 %size) personality ptr @__
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:  # %bb.1: # %bb30.preheader
 ; CHECK-NEXT:    addi 28, 31, 120
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # %invcont23
+; CHECK-NEXT:    #
+; CHECK-NEXT:    ld 3, 128(31)
+; CHECK-NEXT:    sub 30, 30, 3
+; CHECK-NEXT:  .LBB0_3: # %bb30
+; CHECK-NEXT:    #
 ; CHECK-NEXT:    cmpldi 30, 0
-; CHECK-NEXT:    beq 0, .LBB0_4
-; CHECK-NEXT:  .LBB0_2: # %cond_true
+; CHECK-NEXT:    beq 0, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %cond_true
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:  .Ltmp3:
 ; CHECK-NEXT:    mr 3, 29
@@ -52,13 +59,8 @@ define void @Bork(i64 %range.0.0, i64 %range.0.1, i64 %size) personality ptr @__
 ; CHECK-NEXT:    bl Bar
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .Ltmp4:
-; CHECK-NEXT:  # %bb.3: # %invcont23
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ld 3, 128(31)
-; CHECK-NEXT:    sub 30, 30, 3
-; CHECK-NEXT:    cmpldi 30, 0
-; CHECK-NEXT:    bne 0, .LBB0_2
-; CHECK-NEXT:  .LBB0_4: # %cleanup
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_5: # %cleanup
 ; CHECK-NEXT:    ld 30, 160(31) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld 29, 152(31) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld 28, 144(31) # 8-byte Folded Reload
@@ -68,12 +70,12 @@ define void @Bork(i64 %range.0.0, i64 %range.0.1, i64 %size) personality ptr @__
 ; CHECK-NEXT:    ld 31, -8(1)
 ; CHECK-NEXT:    mtlr 0
 ; CHECK-NEXT:    blr
-; CHECK-NEXT:  .LBB0_5: # %unwind.loopexit.split-lp
+; CHECK-NEXT:  .LBB0_6: # %unwind.loopexit.split-lp
 ; CHECK-NEXT:  .Ltmp2:
-; CHECK-NEXT:    b .LBB0_7
-; CHECK-NEXT:  .LBB0_6: # %unwind.loopexit
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_7: # %unwind.loopexit
 ; CHECK-NEXT:  .Ltmp5:
-; CHECK-NEXT:  .LBB0_7: # %unwind
+; CHECK-NEXT:  .LBB0_8: # %unwind
 ; CHECK-NEXT:    ld 4, 0(1)
 ; CHECK-NEXT:    mr 1, 27
 ; CHECK-NEXT:    std 4, 0(1)
diff --git a/llvm/test/CodeGen/PowerPC/branch-opt.ll b/llvm/test/CodeGen/PowerPC/branch-opt.ll
index 5e31270840b80d9..95cfd52036e47f4 100644
--- a/llvm/test/CodeGen/PowerPC/branch-opt.ll
+++ b/llvm/test/CodeGen/PowerPC/branch-opt.ll
@@ -11,10 +11,11 @@ target triple = "powerpc-unknown-linux-gnu"
 ; One of the blocks ends up with a loop exit block that gets a tail-duplicated copy
 ; of %cond_next48, so there should only be two unconditional branches.
 
-;CHECK: b .LBB0_13
-;CHECK: b .LBB0_13
-;CHECK-NOT: b .LBB0_13
-;CHECK: .LBB0_13: # %cond_next48
+;CHECK: b .LBB0_14
+;CHECK: b .LBB0_14
+;CHECK: b .LBB0_14
+;CHECK-NOT: b .LBB0_14
+;CHECK: .LBB0_14: # %cond_next48
 
 define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) {
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
index ccc9adbc2bdd1dd..efb356b7ed817f1 100644
--- a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
+++ b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
@@ -11,7 +11,7 @@ define dso_local zeroext i32 @test(i32 signext %l) nounwind {
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    cmplwi r3, 5
-; CHECK-NEXT:    bgt cr0, .LBB0_3
+; CHECK-NEXT:    bgt cr0, .LBB0_9
 ; CHECK-NEXT:  # %bb.1: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC0 at toc@ha
 ; CHECK-NEXT:    rldic r3, r3, 2, 30
@@ -24,42 +24,41 @@ define dso_local zeroext i32 @test(i32 signext %l) nounwind {
 ; CHECK-NEXT:    li r3, 2
 ; CHECK-NEXT:    bl test1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_3: # %sw.default
-; CHECK-NEXT:    li r3, 1
-; CHECK-NEXT:    bl test1
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    bl test3
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_4: # %sw.bb3
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_3: # %sw.bb3
 ; CHECK-NEXT:    li r3, 3
-; CHECK-NEXT:    b .LBB0_9
-; CHECK-NEXT:  .LBB0_5: # %sw.bb5
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_4: # %sw.bb5
 ; CHECK-NEXT:    li r3, 4
 ; CHECK-NEXT:    bl test2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    bl test3
-; CHECK-NEXT:    nop
 ; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_6: # %sw.bb8
+; CHECK-NEXT:  .LBB0_5: # %sw.bb8
 ; CHECK-NEXT:    li r3, 5
 ; CHECK-NEXT:    bl test4
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_7: # %sw.bb10
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_6: # %sw.bb10
 ; CHECK-NEXT:    li r3, 66
 ; CHECK-NEXT:    bl test4
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    bl test1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_8: # %sw.bb13
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_7: # %sw.bb13
 ; CHECK-NEXT:    li r3, 66
-; CHECK-NEXT:  .LBB0_9: # %return
+; CHECK-NEXT:  .LBB0_8: # %return
 ; CHECK-NEXT:    bl test2
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_9: # %sw.default
+; CHECK-NEXT:    li r3, 1
+; CHECK-NEXT:    bl test1
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_10: # %return
+; CHECK-NEXT:    bl test3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  .LBB0_11: # %return
 ; CHECK-NEXT:    clrldi r3, r3, 32
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
diff --git a/llvm/test/CodeGen/PowerPC/licm-tocReg.ll b/llvm/test/CodeGen/PowerPC/licm-tocReg.ll
index 7b531087501923a..58c15f31c218361 100644
--- a/llvm/test/CodeGen/PowerPC/licm-tocReg.ll
+++ b/llvm/test/CodeGen/PowerPC/licm-tocReg.ll
@@ -73,20 +73,20 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 {
 ; CHECKLX-NEXT:    ld 3, .LC0 at toc@l(3)
 ; CHECKLX-NEXT:    ld 5, .LC1 at toc@l(4)
 ; CHECKLX-NEXT:    lwz 6, 0(3)
+; CHECKLX-NEXT:    b .LBB0_2
 ; CHECKLX-NEXT:    .p2align 5
 ; CHECKLX-NEXT:  .LBB0_1: # %if.end
 ; CHECKLX-NEXT:    #
-; CHECKLX-NEXT:    lwz 7, 0(5)
-; CHECKLX-NEXT:    lwz 4, 0(3)
-; CHECKLX-NEXT:    cmpw 6, 7
-; CHECKLX-NEXT:    bgt 0, .LBB0_3
-; CHECKLX-NEXT:  # %bb.2: # %if.end
-; CHECKLX-NEXT:    #
 ; CHECKLX-NEXT:    addi 4, 4, 1
 ; CHECKLX-NEXT:    stw 4, 0(3)
 ; CHECKLX-NEXT:    lwz 6, 0(3)
-; CHECKLX-NEXT:    b .LBB0_1
-; CHECKLX-NEXT:  .LBB0_3: # %if.then
+; CHECKLX-NEXT:  .LBB0_2: # %if.end
+; CHECKLX-NEXT:    #
+; CHECKLX-NEXT:    lwz 7, 0(5)
+; CHECKLX-NEXT:    lwz 4, 0(3)
+; CHECKLX-NEXT:    cmpw 6, 7
+; CHECKLX-NEXT:    ble 0, .LBB0_1
+; CHECKLX-NEXT:  # %bb.3: # %if.then
 ; CHECKLX-NEXT:    mflr 0
 ; CHECKLX-NEXT:    stdu 1, -32(1)
 ; CHECKLX-NEXT:    std 2, 24(1)
@@ -106,19 +106,19 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 {
 ; CHECKAIX:       # %bb.0: # %entry
 ; CHECKAIX-NEXT:    ld 5, L..C0(2) # @ga
 ; CHECKAIX-NEXT:    ld 6, L..C1(2) # @gb
+; CHECKAIX-NEXT:    b L..BB0_2
 ; CHECKAIX-NEXT:  L..BB0_1: # %if.end
 ; CHECKAIX-NEXT:    #
+; CHECKAIX-NEXT:    addi 4, 4, 1
+; CHECKAIX-NEXT:    stw 4, 0(5)
+; CHECKAIX-NEXT:  L..BB0_2: # %if.end
+; CHECKAIX-NEXT:    #
 ; CHECKAIX-NEXT:    lwz 4, 0(5)
 ; CHECKAIX-NEXT:    lwz 7, 0(6)
 ; CHECKAIX-NEXT:    cmpw 4, 7
 ; CHECKAIX-NEXT:    lwz 4, 0(5)
-; CHECKAIX-NEXT:    bgt 0, L..BB0_3
-; CHECKAIX-NEXT:  # %bb.2: # %if.end
-; CHECKAIX-NEXT:    #
-; CHECKAIX-NEXT:    addi 4, 4, 1
-; CHECKAIX-NEXT:    stw 4, 0(5)
-; CHECKAIX-NEXT:    b L..BB0_1
-; CHECKAIX-NEXT:  L..BB0_3: # %if.then
+; CHECKAIX-NEXT:    ble 0, L..BB0_1
+; CHECKAIX-NEXT:  # %bb.3: # %if.then
 ; CHECKAIX-NEXT:    mflr 0
 ; CHECKAIX-NEXT:    stdu 1, -112(1)
 ; CHECKAIX-NEXT:    ld 5, 0(3)
@@ -139,19 +139,19 @@ define signext i32 @test(ptr nocapture %FP) local_unnamed_addr #0 {
 ; CHECKAIX32:       # %bb.0: # %entry
 ; CHECKAIX32-NEXT:    lwz 5, L..C0(2) # @ga
 ; CHECKAIX32-NEXT:    lwz 6, L..C1(2) # @gb
+; CHECKAIX32-NEXT:    b L..BB0_2
 ; CHECKAIX32-NEXT:  L..BB0_1: # %if.end
 ; CHECKAIX32-NEXT:    #
+; CHECKAIX32-NEXT:    addi 4, 4, 1
+; CHECKAIX32-NEXT:    stw 4, 0(5)
+; CHECKAIX32-NEXT:  L..BB0_2: # %if.end
+; CHECKAIX32-NEXT:    #
 ; CHECKAIX32-NEXT:    lwz 4, 0(5)
 ; CHECKAIX32-NEXT:    lwz 7, 0(6)
 ; CHECKAIX32-NEXT:    cmpw 4, 7
 ; CHECKAIX32-NEXT:    lwz 4, 0(5)
-; CHECKAIX32-NEXT:    bgt 0, L..BB0_3
-; CHECKAIX32-NEXT:  # %bb.2: # %if.end
-; CHECKAIX32-NEXT:    #
-; CHECKAIX32-NEXT:    addi 4, 4, 1
-; CHECKAIX32-NEXT:    stw 4, 0(5)
-; CHECKAIX32-NEXT:    b L..BB0_1
-; CHECKAIX32-NEXT:  L..BB0_3: # %if.then
+; CHECKAIX32-NEXT:    ble 0, L..BB0_1
+; CHECKAIX32-NEXT:  # %bb.3: # %if.then
 ; CHECKAIX32-NEXT:    mflr 0
 ; CHECKAIX32-NEXT:    stwu 1, -64(1)
 ; CHECKAIX32-NEXT:    lwz 5, 0(3)
diff --git a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
index 79f2ef3e3746a79..432b73e4a3a53be 100644
--- a/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/PowerPC/lsr-profitable-chain.ll
@@ -27,22 +27,9 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT:    mulld 30, 8, 30
 ; CHECK-NEXT:    mulld 28, 8, 28
 ; CHECK-NEXT:    mulld 8, 8, 27
-; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    add 5, 5, 9
-; CHECK-NEXT:    add 12, 12, 0
-; CHECK-NEXT:    add 30, 30, 0
-; CHECK-NEXT:    add 28, 28, 0
-; CHECK-NEXT:    add 8, 8, 0
-; CHECK-NEXT:    cmpd 5, 7
-; CHECK-NEXT:    bge 0, .LBB0_6
-; CHECK-NEXT:  .LBB0_3: # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    sub 27, 5, 10
-; CHECK-NEXT:    cmpd 6, 27
-; CHECK-NEXT:    bge 0, .LBB0_2
-; CHECK-NEXT:  # %bb.4:
 ; CHECK-NEXT:    add 25, 6, 12
 ; CHECK-NEXT:    add 24, 6, 8
 ; CHECK-NEXT:    sldi 26, 6, 3
@@ -58,7 +45,7 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT:    add 22, 3, 22
 ; CHECK-NEXT:    add 25, 29, 25
 ; CHECK-NEXT:    .p2align 5
-; CHECK-NEXT:  .LBB0_5: # Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:  .LBB0_3: # Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lfd 0, 0(26)
 ; CHECK-NEXT:    lfd 1, 0(23)
@@ -101,8 +88,21 @@ define void @foo(ptr readonly %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6
 ; CHECK-NEXT:    xsadddp 0, 0, 1
 ; CHECK-NEXT:    stfd 0, 0(26)
 ; CHECK-NEXT:    add 26, 26, 11
-; CHECK-NEXT:    blt 0, .LBB0_5
-; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:    blt 0, .LBB0_3
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    add 5, 5, 9
+; CHECK-NEXT:    add 12, 12, 0
+; CHECK-NEXT:    add 30, 30, 0
+; CHECK-NEXT:    add 28, 28, 0
+; CHECK-NEXT:    add 8, 8, 0
+; CHECK-NEXT:    cmpd 5, 7
+; CHECK-NEXT:    bge 0, .LBB0_6
+; CHECK-NEXT:  .LBB0_5: # =>This Loop Header: Depth=1
+; CHECK-NEXT:    # Child Loop BB0_3 Depth 2
+; CHECK-NEXT:    sub 27, 5, 10
+; CHECK-NEXT:    cmpd 6, 27
+; CHECK-NEXT:    blt 0, .LBB0_2
+; CHECK-NEXT:    b .LBB0_4
 ; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
 ; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
index eeadb73b9db2cff..e97e17fc64782f4 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
@@ -155,11 +155,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_25
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_26: # %bb62
+; CHECK-NEXT:  .LBB0_26: # %bb56
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_26
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_27: # %bb56
+; CHECK-NEXT:  .LBB0_27: # %bb62
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_27
 ; CHECK-NEXT:    .p2align 4
@@ -348,11 +348,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_25
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_26: # %bb62
+; CHECK-BE-NEXT:  .LBB0_26: # %bb56
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_26
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_27: # %bb56
+; CHECK-BE-NEXT:  .LBB0_27: # %bb62
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_27
 ; CHECK-BE-NEXT:    .p2align 4
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
index 32f3342243904e6..4b032781c3764cf 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
@@ -59,10 +59,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    plwz r3, call_1 at PCREL(0), 1
 ; CHECK-NEXT:    cmplwi r3, 0
-; CHECK-NEXT:    bne- cr0, .LBB0_10
+; CHECK-NEXT:    bne- cr0, .LBB0_9
 ; CHECK-NEXT:  # %bb.5: # %bb30
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 4*cr3+eq, .LBB0_9
+; CHECK-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-NEXT:  # %bb.6: # %bb32
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    rlwinm r30, r30, 0, 24, 22
@@ -72,10 +72,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-NEXT:  # %bb.7: # %bb37
 ; CHECK-NEXT:  .LBB0_8: # %bb22
-; CHECK-NEXT:  .LBB0_9: # %bb35
-; CHECK-NEXT:  .LBB0_10: # %bb27
+; CHECK-NEXT:  .LBB0_9: # %bb27
 ; CHECK-NEXT:    bc 4, 4*cr3+lt, .LBB0_12
-; CHECK-NEXT:  # %bb.11: # %bb28
+; CHECK-NEXT:  # %bb.10: # %bb28
+; CHECK-NEXT:  .LBB0_11: # %bb35
 ; CHECK-NEXT:  .LBB0_12: # %bb29
 ; CHECK-NEXT:  .LBB0_13: # %bb3
 ; CHECK-NEXT:  .LBB0_14: # %bb2
@@ -120,10 +120,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    lwz r3, call_1 at toc@l(r30)
 ; CHECK-BE-NEXT:    cmplwi r3, 0
-; CHECK-BE-NEXT:    bne- cr0, .LBB0_10
+; CHECK-BE-NEXT:    bne- cr0, .LBB0_9
 ; CHECK-BE-NEXT:  # %bb.5: # %bb30
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    bc 12, 4*cr3+eq, .LBB0_9
+; CHECK-BE-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-BE-NEXT:  # %bb.6: # %bb32
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 24, 22
@@ -134,10 +134,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-BE-NEXT:  # %bb.7: # %bb37
 ; CHECK-BE-NEXT:  .LBB0_8: # %bb22
-; CHECK-BE-NEXT:  .LBB0_9: # %bb35
-; CHECK-BE-NEXT:  .LBB0_10: # %bb27
+; CHECK-BE-NEXT:  .LBB0_9: # %bb27
 ; CHECK-BE-NEXT:    bc 4, 4*cr3+lt, .LBB0_12
-; CHECK-BE-NEXT:  # %bb.11: # %bb28
+; CHECK-BE-NEXT:  # %bb.10: # %bb28
+; CHECK-BE-NEXT:  .LBB0_11: # %bb35
 ; CHECK-BE-NEXT:  .LBB0_12: # %bb29
 ; CHECK-BE-NEXT:  .LBB0_13: # %bb3
 ; CHECK-BE-NEXT:  .LBB0_14: # %bb2
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
index 4ca2dc5dbe04da4..fd049280c9f39f6 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
@@ -82,7 +82,7 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    lwz r28, 0(r3)
 ; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-NEXT:  # %bb.4: # %bb37
-; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_14
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_12
 ; CHECK-NEXT:  .LBB0_5: # %bb42
 ; CHECK-NEXT:    paddi r3, 0, global_1 at PCREL, 1
 ; CHECK-NEXT:    li r4, 0
@@ -95,10 +95,10 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    lxsihzx v2, 0, r3
 ; CHECK-NEXT:    vextsh2d v2, v2
 ; CHECK-NEXT:    xscvsxdsp f0, v2
-; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB0_12
+; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB0_13
 ; CHECK-NEXT:  # %bb.6: # %bb42
 ; CHECK-NEXT:    xxspltidp vs1, 1069547520
-; CHECK-NEXT:    b .LBB0_13
+; CHECK-NEXT:    b .LBB0_14
 ; CHECK-NEXT:  .LBB0_7: # %bb19
 ; CHECK-NEXT:    setnbc r3, 4*cr2+un
 ; CHECK-NEXT:    paddi r4, 0, global_4 at PCREL, 1
@@ -134,15 +134,15 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    rlwimi r3, r4, 21, 11, 11
 ; CHECK-NEXT:    mtocrf 32, r3
 ; CHECK-NEXT:    b .LBB0_16
-; CHECK-NEXT:  .LBB0_12:
+; CHECK-NEXT:  .LBB0_12: # %bb41
+; CHECK-NEXT:    # implicit-def: $r3
+; CHECK-NEXT:    b .LBB0_15
+; CHECK-NEXT:  .LBB0_13:
 ; CHECK-NEXT:    xxspltidp vs1, 1071644672
-; CHECK-NEXT:  .LBB0_13: # %bb42
+; CHECK-NEXT:  .LBB0_14: # %bb42
 ; CHECK-NEXT:    xsmulsp f0, f1, f0
 ; CHECK-NEXT:    xscvdpsxws f0, f0
 ; CHECK-NEXT:    mffprwz r3, f0
-; CHECK-NEXT:    b .LBB0_15
-; CHECK-NEXT:  .LBB0_14: # %bb41
-; CHECK-NEXT:    # implicit-def: $r3
 ; CHECK-NEXT:  .LBB0_15: # %bb50
 ; CHECK-NEXT:    li r4, 0
 ; CHECK-NEXT:    xxspltidp vs3, -1082130432
@@ -232,7 +232,7 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    addis r3, r2, global_1 at toc@ha
 ; CHECK-BE-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-BE-NEXT:  # %bb.4: # %bb37
-; CHECK-BE-NEXT:    bc 4, 4*cr5+lt, .LBB0_14
+; CHECK-BE-NEXT:    bc 4, 4*cr5+lt, .LBB0_12
 ; CHECK-BE-NEXT:  .LBB0_5: # %bb42
 ; CHECK-BE-NEXT:    addi r3, r3, global_1 at toc@l
 ; CHECK-BE-NEXT:    li r4, 0
@@ -247,10 +247,10 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    lxsihzx v2, 0, r3
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xscvsxdsp f0, v2
-; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB0_12
+; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB0_13
 ; CHECK-BE-NEXT:  # %bb.6: # %bb42
 ; CHECK-BE-NEXT:    xxspltidp vs1, 1069547520
-; CHECK-BE-NEXT:    b .LBB0_13
+; CHECK-BE-NEXT:    b .LBB0_14
 ; CHECK-BE-NEXT:  .LBB0_7: # %bb19
 ; CHECK-BE-NEXT:    setnbc r3, 4*cr2+un
 ; CHECK-BE-NEXT:    addis r4, r2, global_4 at toc@ha
@@ -292,15 +292,15 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    rlwimi r3, r4, 21, 11, 11
 ; CHECK-BE-NEXT:    mtocrf 32, r3
 ; CHECK-BE-NEXT:    b .LBB0_16
-; CHECK-BE-NEXT:  .LBB0_12:
+; CHECK-BE-NEXT:  .LBB0_12: # %bb41
+; CHECK-BE-NEXT:    # implicit-def: $r3
+; CHECK-BE-NEXT:    b .LBB0_15
+; CHECK-BE-NEXT:  .LBB0_13:
 ; CHECK-BE-NEXT:    xxspltidp vs1, 1071644672
-; CHECK-BE-NEXT:  .LBB0_13: # %bb42
+; CHECK-BE-NEXT:  .LBB0_14: # %bb42
 ; CHECK-BE-NEXT:    xsmulsp f0, f1, f0
 ; CHECK-BE-NEXT:    xscvdpsxws f0, f0
 ; CHECK-BE-NEXT:    mffprwz r3, f0
-; CHECK-BE-NEXT:    b .LBB0_15
-; CHECK-BE-NEXT:  .LBB0_14: # %bb41
-; CHECK-BE-NEXT:    # implicit-def: $r3
 ; CHECK-BE-NEXT:  .LBB0_15: # %bb50
 ; CHECK-BE-NEXT:    li r4, 0
 ; CHECK-BE-NEXT:    xxspltidp vs3, -1082130432
diff --git a/llvm/test/CodeGen/PowerPC/pr43527.ll b/llvm/test/CodeGen/PowerPC/pr43527.ll
index 379bd6c070c777f..d7c70ab7f066c80 100644
--- a/llvm/test/CodeGen/PowerPC/pr43527.ll
+++ b/llvm/test/CodeGen/PowerPC/pr43527.ll
@@ -5,9 +5,9 @@
 define dso_local void @test(i64 %arg, i64 %arg1) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %bb
-; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_5
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_6
 ; CHECK-NEXT:  # %bb.1: # %bb3
-; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_6
+; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-NEXT:  # %bb.2: # %bb4
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
@@ -37,8 +37,8 @@ define dso_local void @test(i64 %arg, i64 %arg1) {
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
-; CHECK-NEXT:  .LBB0_5: # %bb2
-; CHECK-NEXT:  .LBB0_6: # %bb14
+; CHECK-NEXT:  .LBB0_5: # %bb14
+; CHECK-NEXT:  .LBB0_6: # %bb2
 bb:
   br i1 undef, label %bb3, label %bb2
 
diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index 0f8014df8adca93..6b3d578f6b33829 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -7,17 +7,17 @@ define hidden void @julia_tryparse_internal_45896() #0 {
 ; CHECK:       # %bb.0: # %top
 ; CHECK-NEXT:    ld r3, 0(r3)
 ; CHECK-NEXT:    cmpldi r3, 0
-; CHECK-NEXT:    beq cr0, .LBB0_3
+; CHECK-NEXT:    beq cr0, .LBB0_6
 ; CHECK-NEXT:  # %bb.1: # %top
 ; CHECK-NEXT:    cmpldi r3, 10
-; CHECK-NEXT:    beq cr0, .LBB0_4
+; CHECK-NEXT:    beq cr0, .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %top
-; CHECK-NEXT:  .LBB0_3: # %fail194
-; CHECK-NEXT:  .LBB0_4: # %L294
-; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_6
-; CHECK-NEXT:  # %bb.5: # %L294
+; CHECK-NEXT:  .LBB0_3: # %L294
+; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %L294
 ; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_7
-; CHECK-NEXT:  .LBB0_6: # %L1057.preheader
+; CHECK-NEXT:  .LBB0_5: # %L1057.preheader
+; CHECK-NEXT:  .LBB0_6: # %fail194
 ; CHECK-NEXT:  .LBB0_7: # %L670
 ; CHECK-NEXT:    li r5, -3
 ; CHECK-NEXT:    cmpdi r3, 0
diff --git a/llvm/test/CodeGen/PowerPC/pr46759.ll b/llvm/test/CodeGen/PowerPC/pr46759.ll
index d1129b1825aeef3..409cd7996ee5422 100644
--- a/llvm/test/CodeGen/PowerPC/pr46759.ll
+++ b/llvm/test/CodeGen/PowerPC/pr46759.ll
@@ -14,15 +14,17 @@ define void @foo(i32 %vla_size) #0 {
 ; CHECK-LE-NEXT:    li r12, -6144
 ; CHECK-LE-NEXT:    add r0, r12, r0
 ; CHECK-LE-NEXT:    sub r12, r0, r1
-; CHECK-LE-NEXT:    cmpdi r12, -4096
-; CHECK-LE-NEXT:    bge cr0, .LBB0_2
+; CHECK-LE-NEXT:    b .LBB0_2
+; CHECK-LE-NEXT:    .p2align 4
 ; CHECK-LE-NEXT:  .LBB0_1: # %entry
 ; CHECK-LE-NEXT:    #
 ; CHECK-LE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:  .LBB0_2: # %entry
+; CHECK-LE-NEXT:    #
 ; CHECK-LE-NEXT:    cmpdi r12, -4096
 ; CHECK-LE-NEXT:    blt cr0, .LBB0_1
-; CHECK-LE-NEXT:  .LBB0_2: # %entry
+; CHECK-LE-NEXT:  # %bb.3: # %entry
 ; CHECK-LE-NEXT:    stdux r30, r1, r12
 ; CHECK-LE-NEXT:    mr r0, r30
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
@@ -46,13 +48,13 @@ define void @foo(i32 %vla_size) #0 {
 ; CHECK-LE-NEXT:    sub r5, r5, r6
 ; CHECK-LE-NEXT:    stdux r3, r1, r5
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    beq cr0, .LBB0_4
-; CHECK-LE-NEXT:  .LBB0_3: # %entry
+; CHECK-LE-NEXT:    beq cr0, .LBB0_5
+; CHECK-LE-NEXT:  .LBB0_4: # %entry
 ; CHECK-LE-NEXT:    #
 ; CHECK-LE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    bne cr0, .LBB0_3
-; CHECK-LE-NEXT:  .LBB0_4: # %entry
+; CHECK-LE-NEXT:    bne cr0, .LBB0_4
+; CHECK-LE-NEXT:  .LBB0_5: # %entry
 ; CHECK-LE-NEXT:    addi r3, r1, 2048
 ; CHECK-LE-NEXT:    lbz r3, 0(r3)
 ; CHECK-LE-NEXT:    mr r1, r30
diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll
index 002dd8f0d167a90..1b76555b67141fd 100644
--- a/llvm/test/CodeGen/PowerPC/pr48519.ll
+++ b/llvm/test/CodeGen/PowerPC/pr48519.ll
@@ -156,28 +156,28 @@ define void @func_48786() #0 {
 ; CHECK-NEXT:    ld r3, 0(r3)
 ; CHECK-NEXT:    cmpdi r3, 0
 ; CHECK-NEXT:    crnot 4*cr2+lt, eq
-; CHECK-NEXT:    b .LBB2_2
+; CHECK-NEXT:    b .LBB2_4
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB2_1: # %bb10
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi r30, r30, -1
-; CHECK-NEXT:    cmpldi r30, 0
-; CHECK-NEXT:    bc 4, gt, .LBB2_5
-; CHECK-NEXT:  .LBB2_2: # %bb2
-; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB2_1
-; CHECK-NEXT:  # %bb.3: # %bb4
+; CHECK-NEXT:  .LBB2_1: # %bb4
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    lhz r3, 0(r3)
 ; CHECK-NEXT:    bl __gnu_h2f_ieee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    bc 4, 4*cr2+lt, .LBB2_6
-; CHECK-NEXT:  # %bb.4: # %bb8
+; CHECK-NEXT:  # %bb.2: # %bb8
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    bl __gnu_f2h_ieee
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    sth r3, 0(0)
-; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3: # %bb10
+; CHECK-NEXT:    #
+; CHECK-NEXT:    addi r30, r30, -1
+; CHECK-NEXT:    cmpldi r30, 0
+; CHECK-NEXT:    bc 4, gt, .LBB2_5
+; CHECK-NEXT:  .LBB2_4: # %bb2
+; CHECK-NEXT:    #
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB2_1
+; CHECK-NEXT:    b .LBB2_3
 ; CHECK-NEXT:  .LBB2_5: # %bb14
 ; CHECK-NEXT:    ld r30, 32(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    addi r1, r1, 48
@@ -195,24 +195,24 @@ define void @func_48786() #0 {
 ; CHECK-P9-NEXT:    mtctr r3
 ; CHECK-P9-NEXT:    li r3, 0
 ; CHECK-P9-NEXT:    crnot 4*cr5+lt, eq
-; CHECK-P9-NEXT:    b .LBB2_2
+; CHECK-P9-NEXT:    b .LBB2_4
 ; CHECK-P9-NEXT:    .p2align 5
-; CHECK-P9-NEXT:  .LBB2_1: # %bb10
-; CHECK-P9-NEXT:    #
-; CHECK-P9-NEXT:    bdzlr
-; CHECK-P9-NEXT:  .LBB2_2: # %bb2
-; CHECK-P9-NEXT:    #
-; CHECK-P9-NEXT:    bc 12, 4*cr5+lt, .LBB2_1
-; CHECK-P9-NEXT:  # %bb.3: # %bb4
+; CHECK-P9-NEXT:  .LBB2_1: # %bb4
 ; CHECK-P9-NEXT:    #
 ; CHECK-P9-NEXT:    lxsihzx f0, 0, r3
 ; CHECK-P9-NEXT:    xscvhpdp f0, f0
 ; CHECK-P9-NEXT:    bc 4, 4*cr5+lt, .LBB2_5
-; CHECK-P9-NEXT:  # %bb.4: # %bb8
+; CHECK-P9-NEXT:  # %bb.2: # %bb8
 ; CHECK-P9-NEXT:    #
 ; CHECK-P9-NEXT:    xscvdphp f0, f0
 ; CHECK-P9-NEXT:    stxsihx f0, 0, r3
-; CHECK-P9-NEXT:    b .LBB2_1
+; CHECK-P9-NEXT:  .LBB2_3: # %bb10
+; CHECK-P9-NEXT:    #
+; CHECK-P9-NEXT:    bdzlr
+; CHECK-P9-NEXT:  .LBB2_4: # %bb2
+; CHECK-P9-NEXT:    #
+; CHECK-P9-NEXT:    bc 4, 4*cr5+lt, .LBB2_1
+; CHECK-P9-NEXT:    b .LBB2_3
 ; CHECK-P9-NEXT:  .LBB2_5: # %bb15
 bb:
   %i = load i64, ptr addrspace(11) undef, align 8
diff --git a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
index 4b19cb30aba2a90..009757c8fb90e04 100644
--- a/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/PowerPC/stack-clash-prologue.ll
@@ -606,14 +606,15 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ; CHECK-LE-NEXT:    li r12, -10240
 ; CHECK-LE-NEXT:    add r0, r12, r0
 ; CHECK-LE-NEXT:    sub r12, r0, r1
-; CHECK-LE-NEXT:    cmpdi r12, -4096
-; CHECK-LE-NEXT:    bge cr0, .LBB9_2
+; CHECK-LE-NEXT:    b .LBB9_2
+; CHECK-LE-NEXT:    .p2align 4
 ; CHECK-LE-NEXT:  .LBB9_1:
 ; CHECK-LE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:  .LBB9_2:
 ; CHECK-LE-NEXT:    cmpdi r12, -4096
 ; CHECK-LE-NEXT:    blt cr0, .LBB9_1
-; CHECK-LE-NEXT:  .LBB9_2:
+; CHECK-LE-NEXT:  # %bb.3:
 ; CHECK-LE-NEXT:    stdux r30, r1, r12
 ; CHECK-LE-NEXT:    mr r0, r30
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
@@ -637,14 +638,14 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ; CHECK-BE-NEXT:    li r12, -10240
 ; CHECK-BE-NEXT:    add r0, r12, r0
 ; CHECK-BE-NEXT:    sub r12, r0, r1
-; CHECK-BE-NEXT:    cmpdi r12, -4096
-; CHECK-BE-NEXT:    bge cr0, .LBB9_2
+; CHECK-BE-NEXT:    b .LBB9_2
 ; CHECK-BE-NEXT:  .LBB9_1:
 ; CHECK-BE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:  .LBB9_2:
 ; CHECK-BE-NEXT:    cmpdi r12, -4096
 ; CHECK-BE-NEXT:    blt cr0, .LBB9_1
-; CHECK-BE-NEXT:  .LBB9_2:
+; CHECK-BE-NEXT:  # %bb.3:
 ; CHECK-BE-NEXT:    stdux r30, r1, r12
 ; CHECK-BE-NEXT:    mr r0, r30
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
@@ -667,14 +668,14 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
 ; CHECK-32-NEXT:    add r0, r12, r0
 ; CHECK-32-NEXT:    sub r12, r0, r1
 ; CHECK-32-NEXT:    mr r0, r1
-; CHECK-32-NEXT:    cmpwi r12, -4096
-; CHECK-32-NEXT:    bge cr0, .LBB9_2
+; CHECK-32-NEXT:    b .LBB9_2
 ; CHECK-32-NEXT:  .LBB9_1:
 ; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:  .LBB9_2:
 ; CHECK-32-NEXT:    cmpwi r12, -4096
 ; CHECK-32-NEXT:    blt cr0, .LBB9_1
-; CHECK-32-NEXT:  .LBB9_2:
+; CHECK-32-NEXT:  # %bb.3:
 ; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
@@ -713,14 +714,15 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ; CHECK-LE-NEXT:    li r12, -5120
 ; CHECK-LE-NEXT:    add r0, r12, r0
 ; CHECK-LE-NEXT:    sub r12, r0, r1
-; CHECK-LE-NEXT:    cmpdi r12, -4096
-; CHECK-LE-NEXT:    bge cr0, .LBB10_2
+; CHECK-LE-NEXT:    b .LBB10_2
+; CHECK-LE-NEXT:    .p2align 4
 ; CHECK-LE-NEXT:  .LBB10_1:
 ; CHECK-LE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:  .LBB10_2:
 ; CHECK-LE-NEXT:    cmpdi r12, -4096
 ; CHECK-LE-NEXT:    blt cr0, .LBB10_1
-; CHECK-LE-NEXT:  .LBB10_2:
+; CHECK-LE-NEXT:  # %bb.3:
 ; CHECK-LE-NEXT:    stdux r30, r1, r12
 ; CHECK-LE-NEXT:    mr r0, r30
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
@@ -744,14 +746,14 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ; CHECK-BE-NEXT:    li r12, -5120
 ; CHECK-BE-NEXT:    add r0, r12, r0
 ; CHECK-BE-NEXT:    sub r12, r0, r1
-; CHECK-BE-NEXT:    cmpdi r12, -4096
-; CHECK-BE-NEXT:    bge cr0, .LBB10_2
+; CHECK-BE-NEXT:    b .LBB10_2
 ; CHECK-BE-NEXT:  .LBB10_1:
 ; CHECK-BE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:  .LBB10_2:
 ; CHECK-BE-NEXT:    cmpdi r12, -4096
 ; CHECK-BE-NEXT:    blt cr0, .LBB10_1
-; CHECK-BE-NEXT:  .LBB10_2:
+; CHECK-BE-NEXT:  # %bb.3:
 ; CHECK-BE-NEXT:    stdux r30, r1, r12
 ; CHECK-BE-NEXT:    mr r0, r30
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
@@ -774,14 +776,14 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
 ; CHECK-32-NEXT:    add r0, r12, r0
 ; CHECK-32-NEXT:    sub r12, r0, r1
 ; CHECK-32-NEXT:    mr r0, r1
-; CHECK-32-NEXT:    cmpwi r12, -4096
-; CHECK-32-NEXT:    bge cr0, .LBB10_2
+; CHECK-32-NEXT:    b .LBB10_2
 ; CHECK-32-NEXT:  .LBB10_1:
 ; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:  .LBB10_2:
 ; CHECK-32-NEXT:    cmpwi r12, -4096
 ; CHECK-32-NEXT:    blt cr0, .LBB10_1
-; CHECK-32-NEXT:  .LBB10_2:
+; CHECK-32-NEXT:  # %bb.3:
 ; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
@@ -821,14 +823,15 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-LE-NEXT:    ori r12, r12, 32768
 ; CHECK-LE-NEXT:    add r0, r12, r0
 ; CHECK-LE-NEXT:    sub r12, r0, r1
-; CHECK-LE-NEXT:    cmpdi r12, -4096
-; CHECK-LE-NEXT:    bge cr0, .LBB11_2
+; CHECK-LE-NEXT:    b .LBB11_2
+; CHECK-LE-NEXT:    .p2align 4
 ; CHECK-LE-NEXT:  .LBB11_1:
 ; CHECK-LE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-LE-NEXT:    addi r12, r12, 4096
+; CHECK-LE-NEXT:  .LBB11_2:
 ; CHECK-LE-NEXT:    cmpdi r12, -4096
 ; CHECK-LE-NEXT:    blt cr0, .LBB11_1
-; CHECK-LE-NEXT:  .LBB11_2:
+; CHECK-LE-NEXT:  # %bb.3:
 ; CHECK-LE-NEXT:    stdux r30, r1, r12
 ; CHECK-LE-NEXT:    mr r0, r30
 ; CHECK-LE-NEXT:    .cfi_def_cfa_register r0
@@ -858,12 +861,12 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-LE-NEXT:    sub r5, r5, r6
 ; CHECK-LE-NEXT:    stdux r3, r1, r5
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    beq cr0, .LBB11_4
-; CHECK-LE-NEXT:  .LBB11_3:
+; CHECK-LE-NEXT:    beq cr0, .LBB11_5
+; CHECK-LE-NEXT:  .LBB11_4:
 ; CHECK-LE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-LE-NEXT:    cmpd r1, r4
-; CHECK-LE-NEXT:    bne cr0, .LBB11_3
-; CHECK-LE-NEXT:  .LBB11_4:
+; CHECK-LE-NEXT:    bne cr0, .LBB11_4
+; CHECK-LE-NEXT:  .LBB11_5:
 ; CHECK-LE-NEXT:    addi r3, r1, -32768
 ; CHECK-LE-NEXT:    lbz r3, 0(r3)
 ; CHECK-LE-NEXT:    mr r1, r30
@@ -882,14 +885,14 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-BE-NEXT:    ori r12, r12, 32768
 ; CHECK-BE-NEXT:    add r0, r12, r0
 ; CHECK-BE-NEXT:    sub r12, r0, r1
-; CHECK-BE-NEXT:    cmpdi r12, -4096
-; CHECK-BE-NEXT:    bge cr0, .LBB11_2
+; CHECK-BE-NEXT:    b .LBB11_2
 ; CHECK-BE-NEXT:  .LBB11_1:
 ; CHECK-BE-NEXT:    stdu r30, -4096(r1)
 ; CHECK-BE-NEXT:    addi r12, r12, 4096
+; CHECK-BE-NEXT:  .LBB11_2:
 ; CHECK-BE-NEXT:    cmpdi r12, -4096
 ; CHECK-BE-NEXT:    blt cr0, .LBB11_1
-; CHECK-BE-NEXT:  .LBB11_2:
+; CHECK-BE-NEXT:  # %bb.3:
 ; CHECK-BE-NEXT:    stdux r30, r1, r12
 ; CHECK-BE-NEXT:    mr r0, r30
 ; CHECK-BE-NEXT:    .cfi_def_cfa_register r0
@@ -919,12 +922,12 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-BE-NEXT:    add r4, r1, r7
 ; CHECK-BE-NEXT:    stdux r3, r1, r5
 ; CHECK-BE-NEXT:    cmpd r1, r4
-; CHECK-BE-NEXT:    beq cr0, .LBB11_4
-; CHECK-BE-NEXT:  .LBB11_3:
+; CHECK-BE-NEXT:    beq cr0, .LBB11_5
+; CHECK-BE-NEXT:  .LBB11_4:
 ; CHECK-BE-NEXT:    stdu r3, -4096(r1)
 ; CHECK-BE-NEXT:    cmpd r1, r4
-; CHECK-BE-NEXT:    bne cr0, .LBB11_3
-; CHECK-BE-NEXT:  .LBB11_4:
+; CHECK-BE-NEXT:    bne cr0, .LBB11_4
+; CHECK-BE-NEXT:  .LBB11_5:
 ; CHECK-BE-NEXT:    addi r3, r1, -32768
 ; CHECK-BE-NEXT:    lbz r3, 0(r3)
 ; CHECK-BE-NEXT:    mr r1, r30
@@ -941,14 +944,14 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-32-NEXT:    add r0, r12, r0
 ; CHECK-32-NEXT:    sub r12, r0, r1
 ; CHECK-32-NEXT:    mr r0, r1
-; CHECK-32-NEXT:    cmpwi r12, -4096
-; CHECK-32-NEXT:    bge cr0, .LBB11_2
+; CHECK-32-NEXT:    b .LBB11_2
 ; CHECK-32-NEXT:  .LBB11_1:
 ; CHECK-32-NEXT:    stwu r0, -4096(r1)
 ; CHECK-32-NEXT:    addi r12, r12, 4096
+; CHECK-32-NEXT:  .LBB11_2:
 ; CHECK-32-NEXT:    cmpwi r12, -4096
 ; CHECK-32-NEXT:    blt cr0, .LBB11_1
-; CHECK-32-NEXT:  .LBB11_2:
+; CHECK-32-NEXT:  # %bb.3:
 ; CHECK-32-NEXT:    stwux r0, r1, r12
 ; CHECK-32-NEXT:    .cfi_def_cfa_register r0
 ; CHECK-32-NEXT:    sub r0, r1, r0
@@ -982,12 +985,12 @@ define void @f11(i32 %vla_size, i64 %i) #0 {
 ; CHECK-32-NEXT:    add r4, r1, r7
 ; CHECK-32-NEXT:    stwux r3, r1, r5
 ; CHECK-32-NEXT:    cmpw r1, r4
-; CHECK-32-NEXT:    beq cr0, .LBB11_4
-; CHECK-32-NEXT:  .LBB11_3:
+; CHECK-32-NEXT:    beq cr0, .LBB11_5
+; CHECK-32-NEXT:  .LBB11_4:
 ; CHECK-32-NEXT:    stwu r3, -4096(r1)
 ; CHECK-32-NEXT:    cmpw r1, r4
-; CHECK-32-NEXT:    bne cr0, .LBB11_3
-; CHECK-32-NEXT:  .LBB11_4:
+; CHECK-32-NEXT:    bne cr0, .LBB11_4
+; CHECK-32-NEXT:  .LBB11_5:
 ; CHECK-32-NEXT:    addi r3, r1, -32768
 ; CHECK-32-NEXT:    lbz r3, 0(r3)
 ; CHECK-32-NEXT:    lwz r31, 0(r1)
diff --git a/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll b/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
index 99780c5e0d444b6..75af75ce75ed9c0 100644
--- a/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
+++ b/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
@@ -14,7 +14,7 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    lw a1, 0(a0)
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    li a2, 4
-; CHECK-NEXT:    bltu a2, a1, .LBB0_3
+; CHECK-NEXT:    bltu a2, a1, .LBB0_7
 ; CHECK-NEXT:  # %bb.1: # %entry
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, %hi(.LJTI0_0)
@@ -24,7 +24,15 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    jr a1
 ; CHECK-NEXT:  .LBB0_2: # %sw.bb
 ; CHECK-NEXT:    tail func1 at plt
-; CHECK-NEXT:  .LBB0_3: # %sw.default
+; CHECK-NEXT:  .LBB0_3: # %sw.bb1
+; CHECK-NEXT:    tail func2 at plt
+; CHECK-NEXT:  .LBB0_4: # %sw.bb3
+; CHECK-NEXT:    tail func3 at plt
+; CHECK-NEXT:  .LBB0_5: # %sw.bb5
+; CHECK-NEXT:    tail func4 at plt
+; CHECK-NEXT:  .LBB0_6: # %sw.bb7
+; CHECK-NEXT:    tail func5 at plt
+; CHECK-NEXT:  .LBB0_7: # %sw.default
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
@@ -34,14 +42,6 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4: # %sw.bb1
-; CHECK-NEXT:    tail func2 at plt
-; CHECK-NEXT:  .LBB0_5: # %sw.bb3
-; CHECK-NEXT:    tail func3 at plt
-; CHECK-NEXT:  .LBB0_6: # %sw.bb5
-; CHECK-NEXT:    tail func4 at plt
-; CHECK-NEXT:  .LBB0_7: # %sw.bb7
-; CHECK-NEXT:    tail func5 at plt
 entry:
   %0 = load i32, ptr %m, align 4
   switch i32 %0, label %sw.default [
diff --git a/llvm/test/CodeGen/Thumb/pr42760.ll b/llvm/test/CodeGen/Thumb/pr42760.ll
index fc9a18bb33420f0..cba2de3324df57b 100644
--- a/llvm/test/CodeGen/Thumb/pr42760.ll
+++ b/llvm/test/CodeGen/Thumb/pr42760.ll
@@ -6,31 +6,34 @@ define hidden void @test() {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    lsls r1, r0, #2
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: @ %bb2
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_1: @ %bb
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    bne .LBB0_7
+; CHECK-NEXT:  .LBB0_2: @ %bb2
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    bne .LBB0_6
-; CHECK-NEXT:  .LBB0_2: @ %switch
+; CHECK-NEXT:    bne .LBB0_7
+; CHECK-NEXT:  .LBB0_3: @ %switch
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    adr r2, .LJTI0_0
 ; CHECK-NEXT:    ldr r2, [r2, r1]
 ; CHECK-NEXT:    mov pc, r2
-; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  @ %bb.4:
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LJTI0_0:
+; CHECK-NEXT:    .long .LBB0_7+1
 ; CHECK-NEXT:    .long .LBB0_6+1
-; CHECK-NEXT:    .long .LBB0_4+1
-; CHECK-NEXT:    .long .LBB0_6+1
+; CHECK-NEXT:    .long .LBB0_7+1
 ; CHECK-NEXT:    .long .LBB0_5+1
-; CHECK-NEXT:  .LBB0_4: @ %switch
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:  .LBB0_5: @ %switch
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    b .LBB0_1
-; CHECK-NEXT:  .LBB0_5: @ %bb
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    beq .LBB0_1
-; CHECK-NEXT:  .LBB0_6: @ %dead
+; CHECK-NEXT:  .LBB0_6: @ %switch
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_7: @ %dead
 entry:
   br label %switch
 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index e0c045ba0440db3..101f8f6ef6eeb1d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -12,25 +12,25 @@ define void @test_memcpy(ptr nocapture %x, ptr nocapture readonly %y, i32 %n, i3
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    lsl.w r12, r3, #2
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_2: @ %for.body
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2: @ Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vldrb.u8 q0, [r4], #16
+; CHECK-NEXT:    vstrb.8 q0, [r5], #16
+; CHECK-NEXT:    letp lr, .LBB0_2
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_3: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_2 Depth 2
 ; CHECK-NEXT:    adds r4, r1, r7
 ; CHECK-NEXT:    adds r5, r0, r7
-; CHECK-NEXT:    wlstp.8 lr, r3, .LBB0_3
-; CHECK-NEXT:    b .LBB0_4
-; CHECK-NEXT:  .LBB0_3: @ %for.body
-; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    wlstp.8 lr, r3, .LBB0_4
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_4: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    add r7, r12
 ; CHECK-NEXT:    subs r2, #1
 ; CHECK-NEXT:    beq .LBB0_5
-; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_4: @ Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrb.u8 q0, [r4], #16
-; CHECK-NEXT:    vstrb.8 q0, [r5], #16
-; CHECK-NEXT:    letp lr, .LBB0_4
 ; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
@@ -65,23 +65,23 @@ define void @test_memset(ptr nocapture %x, i32 %n, i32 %m) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    lsl.w r12, r2, #2
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_2: @ %for.body
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
-; CHECK-NEXT:    mov r4, r0
-; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_3
+; CHECK-NEXT:    b .LBB1_3
+; CHECK-NEXT:  .LBB1_2: @ Parent Loop BB1_3 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vstrb.8 q0, [r4], #16
+; CHECK-NEXT:    letp lr, .LBB1_2
 ; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_3: @ %for.body
-; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB1_2 Depth 2
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_4
+; CHECK-NEXT:    b .LBB1_2
+; CHECK-NEXT:  .LBB1_4: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB1_3 Depth=1
 ; CHECK-NEXT:    add r0, r12
 ; CHECK-NEXT:    subs r1, #1
 ; CHECK-NEXT:    beq .LBB1_5
-; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_4: @ Parent Loop BB1_2 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q0, [r4], #16
-; CHECK-NEXT:    letp lr, .LBB1_4
 ; CHECK-NEXT:    b .LBB1_3
 ; CHECK-NEXT:  .LBB1_5:
 ; CHECK-NEXT:    pop.w {r4, lr}
diff --git a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
index 59e346588754a40..ed895be96cd8910 100644
--- a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
+++ b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
@@ -7,30 +7,30 @@ define internal i32 @table_switch(i32 %x) {
 ; CHECK-NEXT:    bti
 ; CHECK-NEXT:    subs r1, r0, #1
 ; CHECK-NEXT:    cmp r1, #3
-; CHECK-NEXT:    bhi .LBB0_4
+; CHECK-NEXT:    bhi .LBB0_6
 ; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:  .LCPI0_0:
 ; CHECK-NEXT:    tbb [pc, r1]
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:  .LJTI0_0:
-; CHECK-NEXT:    .byte (.LBB0_5-(.LCPI0_0+4))/2
-; CHECK-NEXT:    .byte (.LBB0_3-(.LCPI0_0+4))/2
-; CHECK-NEXT:    .byte (.LBB0_6-(.LCPI0_0+4))/2
 ; CHECK-NEXT:    .byte (.LBB0_7-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_3-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_4-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_5-(.LCPI0_0+4))/2
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  .LBB0_3: @ %bb2
 ; CHECK-NEXT:    movs r0, #2
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_4: @ %sw.epilog
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:  .LBB0_5: @ %return
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_6: @ %bb3
+; CHECK-NEXT:  .LBB0_4: @ %bb3
 ; CHECK-NEXT:    movs r0, #3
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_7: @ %bb4
+; CHECK-NEXT:  .LBB0_5: @ %bb4
 ; CHECK-NEXT:    movs r0, #4
 ; CHECK-NEXT:    bx lr
+; CHECK-NEXT:  .LBB0_6: @ %sw.epilog
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB0_7: @ %return
+; CHECK-NEXT:    bx lr
 entry:
   switch i32 %x, label %sw.epilog [
     i32 1, label %bb1
@@ -93,23 +93,6 @@ declare void @consume_exception(ptr)
 declare i32 @__gxx_personality_v0(...)
 
 define internal i32 @exception_handling(i32 %0) personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: exception_handling:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    bti
-; CHECK-NEXT:    .save  {r7, lr}
-; CHECK-NEXT:    push   {r7, lr}
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    bl     may_throw
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:    movs   r0, #0
-; CHECK-NEXT:    pop    {r7, pc}
-; CHECK-NEXT:  .LBB2_2:
-; CHECK-NEXT:  .Ltmp2:
-; CHECK-NEXT:    bti
-; CHECK-NEXT:    bl     consume_exception
-; CHECK-NEXT:    movs   r0, #1
-; CHECK-NEXT:    pop    {r7, pc}
 entry:
   invoke void @may_throw()
           to label %return unwind label %lpad
diff --git a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
index 98fe30039259f03..1aeecdf1e08f36e 100644
--- a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
+++ b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
@@ -7,27 +7,27 @@ define i32 @test_values(i32 %a, i32 %b) minsize optsize {
 ; CHECK-V6M:         mov r2, r0
 ; CHECK-V6M-NEXT:    ldr r0, .LCPI0_0
 ; CHECK-V6M-NEXT:    cmp r2, #50
-; CHECK-V6M-NEXT:    beq .LBB0_5
-; CHECK-V6M-NEXT:    cmp r2, #1
 ; CHECK-V6M-NEXT:    beq .LBB0_7
+; CHECK-V6M-NEXT:    cmp r2, #1
+; CHECK-V6M-NEXT:    beq .LBB0_5
 ; CHECK-V6M-NEXT:    cmp r2, #30
-; CHECK-V6M-NEXT:    beq .LBB0_8
+; CHECK-V6M-NEXT:    beq .LBB0_6
 ; CHECK-V6M-NEXT:    cmp r2, #0
-; CHECK-V6M-NEXT:    bne .LBB0_6
+; CHECK-V6M-NEXT:    bne .LBB0_8
 ; CHECK-V6M-NEXT:    adds r0, r1, r0
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_5:
 ; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #4
+; CHECK-V6M-NEXT:    adds r0, r0, #1
+; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_6:
+; CHECK-V6M-NEXT:    adds r0, r0, r1
+; CHECK-V6M-NEXT:    adds r0, r0, #2
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_7:
 ; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #1
-; CHECK-V6M-NEXT:    bx lr
+; CHECK-V6M-NEXT:    adds r0, r0, #4
 ; CHECK-V6M-NEXT:  .LBB0_8:
-; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #2
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:    .p2align 2
 ; CHECK-V6M-NEXT:  .LCPI0_0:
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 39bf97d880ea3f4..f0937059a8e7a2c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -428,9 +428,9 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:  .LBB1_4: @ %for.cond2.preheader
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_17 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
-; CHECK-NEXT:    @ Child Loop BB1_12 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_11 Depth 2
+; CHECK-NEXT:    @ Child Loop BB1_8 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_13 Depth 3
 ; CHECK-NEXT:    cmp.w r11, #2
 ; CHECK-NEXT:    bgt .LBB1_3
 ; CHECK-NEXT:  @ %bb.5: @ %for.body6.lr.ph
@@ -451,23 +451,9 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    mov lr, r7
 ; CHECK-NEXT:    mov r7, r4
 ; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    b .LBB1_8
-; CHECK-NEXT:  .LBB1_7: @ %for.cond.cleanup17.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    add.w r11, r3, #7
-; CHECK-NEXT:    cmn.w r3, #4
-; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    bge .LBB1_3
-; CHECK-NEXT:  .LBB1_8: @ %for.body6.us
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
-; CHECK-NEXT:    @ Child Loop BB1_12 Depth 3
-; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    cbz r2, .LBB1_11
-; CHECK-NEXT:  @ %bb.9: @ %for.body13.us51.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    b .LBB1_11
+; CHECK-NEXT:  .LBB1_7: @ %for.body13.us51.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
 ; CHECK-NEXT:    movw r4, :lower16:a
 ; CHECK-NEXT:    vmov q1, q4
 ; CHECK-NEXT:    movt r4, :upper16:a
@@ -476,9 +462,9 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    movt r4, :upper16:b
 ; CHECK-NEXT:    str r1, [r4]
 ; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:  .LBB1_10: @ %vector.body111
+; CHECK-NEXT:  .LBB1_8: @ %vector.body111
 ; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_8 Depth=2
+; CHECK-NEXT:    @ Parent Loop BB1_11 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
 ; CHECK-NEXT:    subs r4, #4
@@ -489,15 +475,32 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_10
-; CHECK-NEXT:    b .LBB1_13
-; CHECK-NEXT:  .LBB1_11: @ %vector.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    bne .LBB1_8
+; CHECK-NEXT:  .LBB1_9: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
+; CHECK-NEXT:    cbnz r6, .LBB1_14
+; CHECK-NEXT:  .LBB1_10: @ %for.cond.cleanup17.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
+; CHECK-NEXT:    add.w r11, r3, #7
+; CHECK-NEXT:    cmn.w r3, #4
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    mov r3, r11
+; CHECK-NEXT:    bge .LBB1_3
+; CHECK-NEXT:  .LBB1_11: @ %for.body6.us
+; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:    @ => This Loop Header: Depth=2
+; CHECK-NEXT:    @ Child Loop BB1_8 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_13 Depth 3
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    bne .LBB1_7
+; CHECK-NEXT:  @ %bb.12: @ %vector.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
 ; CHECK-NEXT:    mov r4, r7
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:  .LBB1_12: @ %vector.body
+; CHECK-NEXT:  .LBB1_13: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_8 Depth=2
+; CHECK-NEXT:    @ Parent Loop BB1_11 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
 ; CHECK-NEXT:    subs r4, #4
@@ -508,16 +511,13 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_12
-; CHECK-NEXT:  .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    cmp r6, #0
-; CHECK-NEXT:    beq .LBB1_7
-; CHECK-NEXT:  @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    bne .LBB1_13
+; CHECK-NEXT:    b .LBB1_9
+; CHECK-NEXT:  .LBB1_14: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
 ; CHECK-NEXT:    eor r1, r10, #1
 ; CHECK-NEXT:    lsls r1, r1, #31
-; CHECK-NEXT:    bne .LBB1_7
+; CHECK-NEXT:    bne .LBB1_10
 ; CHECK-NEXT:    b .LBB1_26
 ; CHECK-NEXT:  .LBB1_15: @ %for.body6.lr.ph.split
 ; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 88131fcf21a9233..51122ddfda63416 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1021,24 +1021,39 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_3: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    b .LBB16_8
+; CHECK-NEXT:  .LBB16_3: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:  .LBB16_4: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_8 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldrh r4, [r6], #2
+; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
+; CHECK-NEXT:    vfma.f16 q0, q1, r4
+; CHECK-NEXT:    le lr, .LBB16_4
+; CHECK-NEXT:  @ %bb.5: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_4
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_4: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
+; CHECK-NEXT:    b .LBB16_7
+; CHECK-NEXT:  .LBB16_6: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    wls lr, r0, .LBB16_7
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_7: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
-; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  .LBB16_5: @ %while.body
+; CHECK-NEXT:    beq .LBB16_12
+; CHECK-NEXT:  .LBB16_8: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_4 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldrh.w lr, [r3, #14]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -1074,14 +1089,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_8
-; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    blo .LBB16_11
+; CHECK-NEXT:  @ %bb.9: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_7: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_10: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_8 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
@@ -1112,26 +1127,11 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_7
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r4, [r6], #2
-; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
-; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    le lr, .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
-; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_11: @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB16_6
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index ca6b8c2fffa22cc..f7fbff39162e0d2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1016,25 +1016,40 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_3: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    b .LBB16_8
+; CHECK-NEXT:  .LBB16_3: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    mov r3, r4
+; CHECK-NEXT:  .LBB16_4: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_8 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    ldr r0, [r7], #4
+; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
+; CHECK-NEXT:    vfma.f32 q0, q1, r0
+; CHECK-NEXT:    le lr, .LBB16_4
+; CHECK-NEXT:  @ %bb.5: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
+; CHECK-NEXT:    b .LBB16_7
+; CHECK-NEXT:  .LBB16_6: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_4
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_4: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    wls lr, r0, .LBB16_7
+; CHECK-NEXT:    b .LBB16_3
+; CHECK-NEXT:  .LBB16_7: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
 ; CHECK-NEXT:    add.w r4, r0, #16
 ; CHECK-NEXT:    beq .LBB16_12
-; CHECK-NEXT:  .LBB16_5: @ %while.body
+; CHECK-NEXT:  .LBB16_8: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_4 Depth 2
 ; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    ldrd r3, r7, [r10]
@@ -1060,14 +1075,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
-; CHECK-NEXT:    blo .LBB16_8
-; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    blo .LBB16_11
+; CHECK-NEXT:  @ %bb.9: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_7: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_10: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_8 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
@@ -1088,26 +1103,11 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q2, r11
 ; CHECK-NEXT:    vfma.f32 q0, q3, r9
 ; CHECK-NEXT:    vfma.f32 q0, q1, r1
-; CHECK-NEXT:    le lr, .LBB16_7
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldr r0, [r7], #4
-; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
-; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    le lr, .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
-; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_11: @ in Loop: Header=BB16_8 Depth=1
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    b .LBB16_6
 ; CHECK-NEXT:  .LBB16_12:
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
@@ -1582,36 +1582,12 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB19_3
-; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r7, r5
-; CHECK-NEXT:    mov r4, r11
-; CHECK-NEXT:    mov r8, r10
-; CHECK-NEXT:  .LBB19_2: @ %if.end69
-; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldrd r2, r6, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    strd r7, r4, [r9]
-; CHECK-NEXT:    subs r6, #1
-; CHECK-NEXT:    strd r3, r8, [r9, #8]
-; CHECK-NEXT:    add.w r9, r9, #16
-; CHECK-NEXT:    mov r1, r2
-; CHECK-NEXT:    beq.w .LBB19_13
-; CHECK-NEXT:  .LBB19_3: @ %do.body
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT:    str r6, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    mov r6, r2
-; CHECK-NEXT:    ldrd r5, r11, [r9]
-; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
-; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT:    wls lr, r2, .LBB19_6
-; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
+; CHECK-NEXT:  .LBB19_1: @ %while.body.lr.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    mov r4, r11
 ; CHECK-NEXT:    mov r3, r5
-; CHECK-NEXT:  .LBB19_5: @ %while.body
+; CHECK-NEXT:  .LBB19_2: @ %while.body
 ; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r5, [r1, #12]
@@ -1638,13 +1614,39 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    vstrb.8 q2, [r6], #16
 ; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    mov r12, r5
-; CHECK-NEXT:    le lr, .LBB19_5
-; CHECK-NEXT:  .LBB19_6: @ %while.end
+; CHECK-NEXT:    le lr, .LBB19_2
+; CHECK-NEXT:    b .LBB19_4
+; CHECK-NEXT:  .LBB19_3: @ %do.body
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB19_2 Depth 2
+; CHECK-NEXT:    str r6, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    ldrd r5, r11, [r9]
+; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
+; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    wls lr, r2, .LBB19_4
+; CHECK-NEXT:    b .LBB19_1
+; CHECK-NEXT:  .LBB19_4: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq .LBB19_1
-; CHECK-NEXT:  @ %bb.7: @ %if.then
+; CHECK-NEXT:    cbnz r2, .LBB19_7
+; CHECK-NEXT:  @ %bb.5: @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    mov r3, r8
+; CHECK-NEXT:    mov r7, r5
+; CHECK-NEXT:    mov r4, r11
+; CHECK-NEXT:    mov r8, r10
+; CHECK-NEXT:  .LBB19_6: @ %if.end69
+; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
+; CHECK-NEXT:    ldrd r2, r6, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    strd r7, r4, [r9]
+; CHECK-NEXT:    subs r6, #1
+; CHECK-NEXT:    strd r3, r8, [r9, #8]
+; CHECK-NEXT:    add.w r9, r9, #16
+; CHECK-NEXT:    mov r1, r2
+; CHECK-NEXT:    beq .LBB19_13
+; CHECK-NEXT:    b .LBB19_3
+; CHECK-NEXT:  .LBB19_7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldrd lr, r4, [r1]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
@@ -1694,7 +1696,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:  .LBB19_12: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    b .LBB19_2
+; CHECK-NEXT:    b .LBB19_6
 ; CHECK-NEXT:  .LBB19_13: @ %do.end
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index ad63e9ee9ff4c92..933e4e74a642820 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -612,13 +612,8 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    itt ne
 ; CHECK-NEXT:    ldrne r0, [sp, #136]
 ; CHECK-NEXT:    cmpne r0, #0
-; CHECK-NEXT:    bne .LBB10_2
-; CHECK-NEXT:  .LBB10_1: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-; CHECK-NEXT:  .LBB10_2: @ %for.cond1.preheader.us.preheader
+; CHECK-NEXT:    beq.w .LBB10_15
+; CHECK-NEXT:  @ %bb.1: @ %for.cond1.preheader.us.preheader
 ; CHECK-NEXT:    ldr.w r12, [sp, #140]
 ; CHECK-NEXT:    movs r7, #1
 ; CHECK-NEXT:    mov.w r11, #0
@@ -638,69 +633,18 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    vshl.i32 q3, q1, #2
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    b .LBB10_5
-; CHECK-NEXT:  .LBB10_3: @ %for.cond5.preheader.us73.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    add.w r3, r0, r5, lsl #1
-; CHECK-NEXT:    wlstp.8 lr, r6, .LBB10_4
-; CHECK-NEXT:    b .LBB10_15
-; CHECK-NEXT:  .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    add r11, r12
-; CHECK-NEXT:    ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add r3, r0
-; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    adds r3, #1
-; CHECK-NEXT:    cmp r3, r0
-; CHECK-NEXT:    beq .LBB10_1
-; CHECK-NEXT:  .LBB10_5: @ %for.cond1.preheader.us
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB10_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
-; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT:    @ Child Loop BB10_15 Depth 2
-; CHECK-NEXT:    mul r5, r3, r7
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    beq .LBB10_3
-; CHECK-NEXT:  @ %bb.6: @ %for.cond5.preheader.us.us.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT:    mov.w r8, #0
-; CHECK-NEXT:    b .LBB10_8
-; CHECK-NEXT:  .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
-; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    ldr r3, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    add.w r0, r8, r5
-; CHECK-NEXT:    add.w r8, r8, #1
-; CHECK-NEXT:    cmp r8, r7
-; CHECK-NEXT:    strh.w r10, [r3, r0, lsl #1]
-; CHECK-NEXT:    beq .LBB10_4
-; CHECK-NEXT:  .LBB10_8: @ %for.cond5.preheader.us.us
-; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
-; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB10_11 Depth 3
-; CHECK-NEXT:    @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT:    cmp.w r12, #3
-; CHECK-NEXT:    bhi .LBB10_10
-; CHECK-NEXT:  @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    b .LBB10_13
-; CHECK-NEXT:  .LBB10_10: @ %vector.ph
-; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    b .LBB10_10
+; CHECK-NEXT:  .LBB10_2: @ %vector.ph
+; CHECK-NEXT:    @ in Loop: Header=BB10_6 Depth=2
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    vmov q5, q1
 ; CHECK-NEXT:    vmov.i32 q4, #0x0
 ; CHECK-NEXT:    vmlas.i32 q5, q2, r8
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:  .LBB10_11: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
+; CHECK-NEXT:  .LBB10_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB10_10 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB10_6 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vadd.i32 q6, q5, q3
 ; CHECK-NEXT:    vldrh.s32 q7, [r1, q5, uxtw #1]
@@ -708,15 +652,33 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    vmul.i32 q5, q7, q5
 ; CHECK-NEXT:    vadd.i32 q4, q5, q4
 ; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    le lr, .LBB10_11
-; CHECK-NEXT:  @ %bb.12: @ %middle.block
-; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    le lr, .LBB10_3
+; CHECK-NEXT:  @ %bb.4: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB10_6 Depth=2
 ; CHECK-NEXT:    vaddv.u32 r10, q4
 ; CHECK-NEXT:    cmp r2, r12
 ; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    beq .LBB10_7
-; CHECK-NEXT:  .LBB10_13: @ %for.body8.us.us.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB10_8 Depth=2
+; CHECK-NEXT:    bne .LBB10_8
+; CHECK-NEXT:  .LBB10_5: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
+; CHECK-NEXT:    @ in Loop: Header=BB10_6 Depth=2
+; CHECK-NEXT:    ldr r3, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    add.w r0, r8, r5
+; CHECK-NEXT:    add.w r8, r8, #1
+; CHECK-NEXT:    cmp r8, r7
+; CHECK-NEXT:    strh.w r10, [r3, r0, lsl #1]
+; CHECK-NEXT:    beq .LBB10_14
+; CHECK-NEXT:  .LBB10_6: @ %for.cond5.preheader.us.us
+; CHECK-NEXT:    @ Parent Loop BB10_10 Depth=1
+; CHECK-NEXT:    @ => This Loop Header: Depth=2
+; CHECK-NEXT:    @ Child Loop BB10_3 Depth 3
+; CHECK-NEXT:    @ Child Loop BB10_9 Depth 3
+; CHECK-NEXT:    cmp.w r12, #3
+; CHECK-NEXT:    bhi .LBB10_2
+; CHECK-NEXT:  @ %bb.7: @ in Loop: Header=BB10_6 Depth=2
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:  .LBB10_8: @ %for.body8.us.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_6 Depth=2
 ; CHECK-NEXT:    mla r3, r7, r4, r8
 ; CHECK-NEXT:    add.w r0, r11, r4
 ; CHECK-NEXT:    ldr r7, [sp, #24] @ 4-byte Reload
@@ -724,21 +686,56 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
 ; CHECK-NEXT:    add.w r9, r7, r0, lsl #1
 ; CHECK-NEXT:    ldr r7, [sp, #136]
 ; CHECK-NEXT:    add.w r3, r1, r3, lsl #1
-; CHECK-NEXT:  .LBB10_14: @ %for.body8.us.us
-; CHECK-NEXT:    @ Parent Loop BB10_5 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB10_8 Depth=2
+; CHECK-NEXT:  .LBB10_9: @ %for.body8.us.us
+; CHECK-NEXT:    @ Parent Loop BB10_10 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB10_6 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    ldrsh.w r4, [r3]
 ; CHECK-NEXT:    add r3, r6
 ; CHECK-NEXT:    ldrsh r0, [r9], #2
 ; CHECK-NEXT:    smlabb r10, r4, r0, r10
-; CHECK-NEXT:    le lr, .LBB10_14
-; CHECK-NEXT:    b .LBB10_7
-; CHECK-NEXT:  .LBB10_15: @ Parent Loop BB10_5 Depth=1
+; CHECK-NEXT:    le lr, .LBB10_9
+; CHECK-NEXT:    b .LBB10_5
+; CHECK-NEXT:  .LBB10_10: @ %for.cond1.preheader.us
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB10_6 Depth 2
+; CHECK-NEXT:    @ Child Loop BB10_3 Depth 3
+; CHECK-NEXT:    @ Child Loop BB10_9 Depth 3
+; CHECK-NEXT:    @ Child Loop BB10_13 Depth 2
+; CHECK-NEXT:    mul r5, r3, r7
+; CHECK-NEXT:    cmp.w r12, #0
+; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    beq .LBB10_12
+; CHECK-NEXT:  @ %bb.11: @ %for.cond5.preheader.us.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_10 Depth=1
+; CHECK-NEXT:    mov.w r8, #0
+; CHECK-NEXT:    b .LBB10_6
+; CHECK-NEXT:  .LBB10_12: @ %for.cond5.preheader.us73.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB10_10 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    add.w r3, r0, r5, lsl #1
+; CHECK-NEXT:    wlstp.8 lr, r6, .LBB10_14
+; CHECK-NEXT:  .LBB10_13: @ Parent Loop BB10_10 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r3], #16
-; CHECK-NEXT:    letp lr, .LBB10_15
-; CHECK-NEXT:    b .LBB10_4
+; CHECK-NEXT:    letp lr, .LBB10_13
+; CHECK-NEXT:  .LBB10_14: @ %for.cond1.for.cond.cleanup3_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB10_10 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    add r11, r12
+; CHECK-NEXT:    ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add r3, r0
+; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    adds r3, #1
+; CHECK-NEXT:    cmp r3, r0
+; CHECK-NEXT:    bne .LBB10_10
+; CHECK-NEXT:  .LBB10_15: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.16:
 ; CHECK-NEXT:  .LCPI10_0:
diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-branch.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-branch.ll
index b5c9b903e184168..e290f64e208e900 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-branch.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-branch.ll
@@ -22,31 +22,28 @@ define i32 @a(i8 zeroext %b, ptr nocapture readonly %c, ptr nocapture readonly %
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    b .LBB0_3
-; CHECK-NEXT:  .LBB0_2: @ %land.end.us.3
-; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:  .LBB0_3: @ %for.body.us
-; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
-; CHECK-NEXT:    @ Child Loop BB0_6 Depth 2
-; CHECK-NEXT:    @ Child Loop BB0_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB0_11 Depth 2
-; CHECK-NEXT:    ldr.w r0, [r2, r3, lsl #2]
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_2: @ Parent Loop BB0_8 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vstrb.8 q1, [r3], #16
+; CHECK-NEXT:    letp lr, .LBB0_2
+; CHECK-NEXT:  .LBB0_3: @ %land.end.us.1
+; CHECK-NEXT:    @ in Loop: Header=BB0_8 Depth=1
+; CHECK-NEXT:    ldr r0, [r2, #4]
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ite ne
-; CHECK-NEXT:    ldrbne r0, [r1, r3]
+; CHECK-NEXT:    ldrbne r0, [r1, #1]
 ; CHECK-NEXT:    moveq r0, #0
-; CHECK-NEXT:    mla r3, r3, r12, r5
-; CHECK-NEXT:    add r3, r0
+; CHECK-NEXT:    adds r3, r5, r0
 ; CHECK-NEXT:    rsb.w r0, r0, #108
+; CHECK-NEXT:    adds r3, #19
 ; CHECK-NEXT:    wlstp.8 lr, r0, .LBB0_5
-; CHECK-NEXT:  .LBB0_4: @ Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:  .LBB0_4: @ Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q0, [r3], #16
+; CHECK-NEXT:    vstrb.8 q2, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB0_4
-; CHECK-NEXT:  .LBB0_5: @ %land.end.us
-; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:  .LBB0_5: @ %land.end.us.2
+; CHECK-NEXT:    @ in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [r2, #4]
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ite ne
@@ -56,27 +53,34 @@ define i32 @a(i8 zeroext %b, ptr nocapture readonly %c, ptr nocapture readonly %
 ; CHECK-NEXT:    rsb.w r0, r0, #108
 ; CHECK-NEXT:    adds r3, #19
 ; CHECK-NEXT:    wlstp.8 lr, r0, .LBB0_7
-; CHECK-NEXT:  .LBB0_6: @ Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:  .LBB0_6: @ Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q1, [r3], #16
+; CHECK-NEXT:    vstrb.8 q3, [r3], #16
 ; CHECK-NEXT:    letp lr, .LBB0_6
-; CHECK-NEXT:  .LBB0_7: @ %land.end.us.1
-; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    ldr r0, [r2, #4]
+; CHECK-NEXT:  .LBB0_7: @ %land.end.us.3
+; CHECK-NEXT:    @ in Loop: Header=BB0_8 Depth=1
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:  .LBB0_8: @ %for.body.us
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_9 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_2 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
+; CHECK-NEXT:    @ Child Loop BB0_6 Depth 2
+; CHECK-NEXT:    ldr.w r0, [r2, r3, lsl #2]
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ite ne
-; CHECK-NEXT:    ldrbne r0, [r1, #1]
+; CHECK-NEXT:    ldrbne r0, [r1, r3]
 ; CHECK-NEXT:    moveq r0, #0
-; CHECK-NEXT:    adds r3, r5, r0
+; CHECK-NEXT:    mla r3, r3, r12, r5
+; CHECK-NEXT:    add r3, r0
 ; CHECK-NEXT:    rsb.w r0, r0, #108
-; CHECK-NEXT:    adds r3, #19
-; CHECK-NEXT:    wlstp.8 lr, r0, .LBB0_9
-; CHECK-NEXT:  .LBB0_8: @ Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB0_10
+; CHECK-NEXT:  .LBB0_9: @ Parent Loop BB0_8 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q2, [r3], #16
-; CHECK-NEXT:    letp lr, .LBB0_8
-; CHECK-NEXT:  .LBB0_9: @ %land.end.us.2
-; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r3], #16
+; CHECK-NEXT:    letp lr, .LBB0_9
+; CHECK-NEXT:  .LBB0_10: @ %land.end.us
+; CHECK-NEXT:    @ in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    ldr r0, [r2, #4]
 ; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    ite ne
@@ -88,14 +92,10 @@ define i32 @a(i8 zeroext %b, ptr nocapture readonly %c, ptr nocapture readonly %
 ; CHECK-NEXT:    adds r3, #19
 ; CHECK-NEXT:    lsrs r4, r4, #4
 ; CHECK-NEXT:    cmp.w r4, #0
-; CHECK-NEXT:    beq .LBB0_2
-; CHECK-NEXT:  @ %bb.10: @ %land.end.us.2
-; CHECK-NEXT:    @ in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    beq .LBB0_3
+; CHECK-NEXT:  @ %bb.11: @ %land.end.us
+; CHECK-NEXT:    @ in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    dlstp.8 lr, r0
-; CHECK-NEXT:  .LBB0_11: @ Parent Loop BB0_3 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q3, [r3], #16
-; CHECK-NEXT:    letp lr, .LBB0_11
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_12:
 ; CHECK-NEXT:    movw r12, :lower16:arr_183
diff --git a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
index da59cb259db6168..3fc7162c3c5e979 100644
--- a/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-memtp-loop.ll
@@ -450,56 +450,52 @@ define void @multilooped_exit(i32 %b) {
 ; CHECK-NEXT:    mov.w r4, #-1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    b .LBB18_3
-; CHECK-NEXT:  .LBB18_2: @ %loop
-; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
-; CHECK-NEXT:    adds r4, #1
-; CHECK-NEXT:    cmp.w r4, #1024
-; CHECK-NEXT:    bge .LBB18_12
+; CHECK-NEXT:  .LBB18_2: @ Parent Loop BB18_3 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    letp lr, .LBB18_2
+; CHECK-NEXT:    b .LBB18_4
 ; CHECK-NEXT:  .LBB18_3: @ %loop
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB18_4 Depth 2
-; CHECK-NEXT:    @ Child Loop BB18_6 Depth 2
-; CHECK-NEXT:    @ Child Loop BB18_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB18_11 Depth 2
+; CHECK-NEXT:    @ Child Loop BB18_2 Depth 2
+; CHECK-NEXT:    @ Child Loop BB18_5 Depth 2
+; CHECK-NEXT:    @ Child Loop BB18_7 Depth 2
+; CHECK-NEXT:    @ Child Loop BB18_9 Depth 2
 ; CHECK-NEXT:    movw r3, :lower16:arr_56
-; CHECK-NEXT:    add.w r1, r0, #15
 ; CHECK-NEXT:    movt r3, :upper16:arr_56
-; CHECK-NEXT:    lsr.w r12, r1, #4
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_5
-; CHECK-NEXT:  .LBB18_4: @ Parent Loop BB18_3 Depth=1
-; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB18_4
-; CHECK-NEXT:  .LBB18_5: @ %loop
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_4
+; CHECK-NEXT:    b .LBB18_2
+; CHECK-NEXT:  .LBB18_4: @ %loop
 ; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_7
-; CHECK-NEXT:  .LBB18_6: @ Parent Loop BB18_3 Depth=1
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_6
+; CHECK-NEXT:  .LBB18_5: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB18_6
-; CHECK-NEXT:  .LBB18_7: @ %loop
+; CHECK-NEXT:    letp lr, .LBB18_5
+; CHECK-NEXT:  .LBB18_6: @ %loop
 ; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
 ; CHECK-NEXT:    mov r2, r3
-; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_9
-; CHECK-NEXT:  .LBB18_8: @ Parent Loop BB18_3 Depth=1
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_8
+; CHECK-NEXT:  .LBB18_7: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    letp lr, .LBB18_8
-; CHECK-NEXT:  .LBB18_9: @ %loop
-; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
-; CHECK-NEXT:    cmp.w r12, #0
-; CHECK-NEXT:    beq .LBB18_2
-; CHECK-NEXT:  @ %bb.10: @ %loop
+; CHECK-NEXT:    letp lr, .LBB18_7
+; CHECK-NEXT:  .LBB18_8: @ %loop
 ; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
-; CHECK-NEXT:    dlstp.8 lr, r0
-; CHECK-NEXT:  .LBB18_11: @ Parent Loop BB18_3 Depth=1
+; CHECK-NEXT:    wlstp.8 lr, r0, .LBB18_10
+; CHECK-NEXT:  .LBB18_9: @ Parent Loop BB18_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vstrb.8 q0, [r3], #16
-; CHECK-NEXT:    letp lr, .LBB18_11
-; CHECK-NEXT:    b .LBB18_2
-; CHECK-NEXT:  .LBB18_12:
+; CHECK-NEXT:    letp lr, .LBB18_9
+; CHECK-NEXT:  .LBB18_10: @ %loop
+; CHECK-NEXT:    @ in Loop: Header=BB18_3 Depth=1
+; CHECK-NEXT:    adds r4, #1
+; CHECK-NEXT:    cmp.w r4, #1024
+; CHECK-NEXT:    bge .LBB18_11
+; CHECK-NEXT:    b .LBB18_3
+; CHECK-NEXT:  .LBB18_11:
 ; CHECK-NEXT:    pop.w {r4, lr}
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
index 2aa183c31bab595..db209931c29a6c0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll
@@ -1073,37 +1073,10 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    mov.w r12, #1
 ; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    lsrs r1, r2, #2
-; CHECK-NEXT:    b .LBB7_3
-; CHECK-NEXT:  .LBB7_2: @ in Loop: Header=BB7_3 Depth=1
-; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    add.w r11, r11, #1
-; CHECK-NEXT:    lsl.w r12, r12, #2
-; CHECK-NEXT:    cmp r2, #7
-; CHECK-NEXT:    asr.w r1, r2, #2
-; CHECK-NEXT:    ble .LBB7_9
-; CHECK-NEXT:  .LBB7_3: @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB7_6 Depth 2
-; CHECK-NEXT:    @ Child Loop BB7_7 Depth 3
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    cmp.w r12, #1
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    lsr.w r2, r1, #2
-; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    blt .LBB7_2
-; CHECK-NEXT:  @ %bb.4: @ in Loop: Header=BB7_3 Depth=1
-; CHECK-NEXT:    lsrs r2, r1, #3
-; CHECK-NEXT:    str r2, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    beq .LBB7_2
-; CHECK-NEXT:  @ %bb.5: @ %.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB7_3 Depth=1
-; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    lsls r1, r1, #1
-; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    lsl.w r10, r2, #1
-; CHECK-NEXT:  .LBB7_6: @ Parent Loop BB7_3 Depth=1
+; CHECK-NEXT:    b .LBB7_6
+; CHECK-NEXT:  .LBB7_2: @ Parent Loop BB7_6 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB7_7 Depth 3
+; CHECK-NEXT:    @ Child Loop BB7_3 Depth 3
 ; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd lr, r2, [r0, #16]
 ; CHECK-NEXT:    ldrd r3, r8, [r0, #24]
@@ -1122,8 +1095,8 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    add.w r8, r2, r10, lsl #2
 ; CHECK-NEXT:    add.w r9, r8, r10, lsl #2
 ; CHECK-NEXT:    dls lr, r6
-; CHECK-NEXT:  .LBB7_7: @ Parent Loop BB7_3 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB7_6 Depth=2
+; CHECK-NEXT:  .LBB7_3: @ Parent Loop BB7_6 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vldrw.u32 q3, [r9]
 ; CHECK-NEXT:    vldrw.u32 q4, [r2]
@@ -1150,11 +1123,38 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf
 ; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #0
 ; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
 ; CHECK-NEXT:    vstrb.8 q2, [r9], #16
-; CHECK-NEXT:    le lr, .LBB7_7
-; CHECK-NEXT:  @ %bb.8: @ in Loop: Header=BB7_6 Depth=2
+; CHECK-NEXT:    le lr, .LBB7_3
+; CHECK-NEXT:  @ %bb.4: @ in Loop: Header=BB7_2 Depth=2
 ; CHECK-NEXT:    adds r4, #1
 ; CHECK-NEXT:    cmp r4, r12
-; CHECK-NEXT:    bne .LBB7_6
+; CHECK-NEXT:    bne .LBB7_2
+; CHECK-NEXT:  .LBB7_5: @ in Loop: Header=BB7_6 Depth=1
+; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    add.w r11, r11, #1
+; CHECK-NEXT:    lsl.w r12, r12, #2
+; CHECK-NEXT:    cmp r2, #7
+; CHECK-NEXT:    asr.w r1, r2, #2
+; CHECK-NEXT:    ble .LBB7_9
+; CHECK-NEXT:  .LBB7_6: @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB7_2 Depth 2
+; CHECK-NEXT:    @ Child Loop BB7_3 Depth 3
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    cmp.w r12, #1
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    lsr.w r2, r1, #2
+; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    blt .LBB7_5
+; CHECK-NEXT:  @ %bb.7: @ in Loop: Header=BB7_6 Depth=1
+; CHECK-NEXT:    lsrs r2, r1, #3
+; CHECK-NEXT:    str r2, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    beq .LBB7_5
+; CHECK-NEXT:  @ %bb.8: @ %.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB7_6 Depth=1
+; CHECK-NEXT:    ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    lsls r1, r1, #1
+; CHECK-NEXT:    movs r4, #0
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    lsl.w r10, r2, #1
 ; CHECK-NEXT:    b .LBB7_2
 ; CHECK-NEXT:  .LBB7_9:
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index 216d4cca097001c..fd44a54a7273801 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -21,15 +21,15 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    or %s0, 3, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    or %s0, 7, (0)1
 ; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    or %s0, 3, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; PIC-LABEL: br_jt3:
 ; PIC:       # %bb.0:
@@ -43,14 +43,14 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB0_1:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB0_5:
 ; PIC-NEXT:    or %s0, 7, (0)1
 ; PIC-NEXT:  .LBB0_6:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB0_1:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %4 [
     i32 1, label %5
@@ -308,16 +308,16 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB4_1:
-; CHECK-NEXT:    or %s0, 3, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB4_5:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
 ; CHECK-NEXT:  .LBB4_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB4_1:
+; CHECK-NEXT:    or %s0, 3, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; PIC-LABEL: br_jt3_m:
 ; PIC:       # %bb.0:
@@ -331,15 +331,15 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB4_1:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB4_5:
 ; PIC-NEXT:    and %s0, %s1, (32)0
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s0
 ; PIC-NEXT:  .LBB4_6:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB4_1:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %6 [
     i32 1, label %7
@@ -534,15 +534,15 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB6_2:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_15:
 ; PIC-NEXT:    or %s0, 11, (0)1
 ; PIC-NEXT:  .LBB6_16:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB6_2:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_13:
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -615,11 +615,6 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB7_9:
-; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB7_10:
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB7_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -5, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -628,12 +623,17 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB7_7:
+; CHECK-NEXT:    or %s0, 10, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB7_8:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB7_7:
-; CHECK-NEXT:    or %s0, 10, (0)1
+; CHECK-NEXT:  .LBB7_9:
+; CHECK-NEXT:    or %s0, 0, %s2
+; CHECK-NEXT:  .LBB7_10:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -672,20 +672,20 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:  .LBB7_4:
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
-; PIC-NEXT:  .LBB7_9:
-; PIC-NEXT:    or %s0, 0, %s2
-; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_5:
 ; PIC-NEXT:    adds.w.sx %s0, -5, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_6:
 ; PIC-NEXT:    adds.w.sx %s0, -2, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
+; PIC-NEXT:  .LBB7_7:
+; PIC-NEXT:    or %s0, 10, (0)1
+; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_8:
 ; PIC-NEXT:    or %s0, 11, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
-; PIC-NEXT:  .LBB7_7:
-; PIC-NEXT:    or %s0, 10, (0)1
+; PIC-NEXT:  .LBB7_9:
+; PIC-NEXT:    or %s0, 0, %s2
 ; PIC-NEXT:  .LBB7_10:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    ld %s16, 32(, %s11)
diff --git a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index ca281043eddaaee..42e39c22b43cef3 100644
--- a/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -53,6 +53,30 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    movb $1, %r8b
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_4: # %a33b
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    movl %esi, %r9d
+; CHECK-NEXT:    orl %eax, %r9d
+; CHECK-NEXT:    jns .LBB0_20
+; CHECK-NEXT:  .LBB0_5: # %a50b
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    movl %eax, %r10d
+; CHECK-NEXT:    orl %esi, %r10d
+; CHECK-NEXT:    jns .LBB0_25
+; CHECK-NEXT:  .LBB0_6: # %a57b
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    shrl $31, %r9d
+; CHECK-NEXT:    testb %r9b, %r9b
+; CHECK-NEXT:    je .LBB0_29
+; CHECK-NEXT:  .LBB0_7: # %a66b
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    shrl $31, %r10d
+; CHECK-NEXT:    testb %r10b, %r10b
+; CHECK-NEXT:    je .LBB0_33
+; CHECK-NEXT:  .LBB0_8: # %a93b
+; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jns .LBB0_35
 ; CHECK-NEXT:  .LBB0_9: # %b1606
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    testb %dil, %dil
@@ -95,26 +119,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    testb %dil, %dil
 ; CHECK-NEXT:    je .LBB0_19
-; CHECK-NEXT:  .LBB0_4: # %a33b
-; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    movl %esi, %r9d
-; CHECK-NEXT:    orl %eax, %r9d
-; CHECK-NEXT:    jns .LBB0_20
-; CHECK-NEXT:  .LBB0_5: # %a50b
-; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    movl %eax, %r10d
-; CHECK-NEXT:    orl %esi, %r10d
-; CHECK-NEXT:    jns .LBB0_25
-; CHECK-NEXT:  .LBB0_6: # %a57b
-; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    shrl $31, %r9d
-; CHECK-NEXT:    testb %r9b, %r9b
-; CHECK-NEXT:    je .LBB0_29
-; CHECK-NEXT:  .LBB0_7: # %a66b
-; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    shrl $31, %r10d
-; CHECK-NEXT:    testb %r10b, %r10b
-; CHECK-NEXT:    jne .LBB0_8
+; CHECK-NEXT:    jmp .LBB0_4
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_33: # %a74b
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
@@ -125,10 +130,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    # in Loop: Header=BB0_33 Depth=2
 ; CHECK-NEXT:    testb %dil, %dil
 ; CHECK-NEXT:    jne .LBB0_33
-; CHECK-NEXT:  .LBB0_8: # %a93b
-; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    testl %eax, %eax
-; CHECK-NEXT:    js .LBB0_9
+; CHECK-NEXT:    jmp .LBB0_8
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_35: # %a97b
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
@@ -163,6 +165,28 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    je .LBB0_37
 ; CHECK-NEXT:    jmp .LBB0_21
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_23: # %a45b
+; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    # Parent Loop BB0_22 Depth=2
+; CHECK-NEXT:    # => This Inner Loop Header: Depth=3
+; CHECK-NEXT:    testb %dil, %dil
+; CHECK-NEXT:    je .LBB0_23
+; CHECK-NEXT:  .LBB0_24: # %b712
+; CHECK-NEXT:    # in Loop: Header=BB0_22 Depth=2
+; CHECK-NEXT:    testb %dil, %dil
+; CHECK-NEXT:    je .LBB0_5
+; CHECK-NEXT:  .LBB0_22: # %b535
+; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
+; CHECK-NEXT:    # => This Loop Header: Depth=2
+; CHECK-NEXT:    # Child Loop BB0_23 Depth 3
+; CHECK-NEXT:    testq %rdx, %rdx
+; CHECK-NEXT:    jns .LBB0_23
+; CHECK-NEXT:    jmp .LBB0_24
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_26: # %b879
+; CHECK-NEXT:    # in Loop: Header=BB0_25 Depth=2
+; CHECK-NEXT:    testb %r8b, %r8b
+; CHECK-NEXT:    je .LBB0_28
 ; CHECK-NEXT:  .LBB0_27: # %b1016
 ; CHECK-NEXT:    # in Loop: Header=BB0_25 Depth=2
 ; CHECK-NEXT:    testq %rsi, %rsi
@@ -181,10 +205,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    testb %dil, %dil
 ; CHECK-NEXT:    je .LBB0_38
-; CHECK-NEXT:  .LBB0_26: # %b879
-; CHECK-NEXT:    # in Loop: Header=BB0_25 Depth=2
-; CHECK-NEXT:    testb %r8b, %r8b
-; CHECK-NEXT:    jne .LBB0_27
+; CHECK-NEXT:    jmp .LBB0_26
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_28: # %a53b1019
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
@@ -194,6 +215,10 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    jle .LBB0_28
 ; CHECK-NEXT:    jmp .LBB0_27
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_30: # %b1139
+; CHECK-NEXT:    # in Loop: Header=BB0_29 Depth=2
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    jg .LBB0_32
 ; CHECK-NEXT:  .LBB0_31: # %b1263
 ; CHECK-NEXT:    # in Loop: Header=BB0_29 Depth=2
 ; CHECK-NEXT:    testq %rdx, %rdx
@@ -212,10 +237,7 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    testq %rsi, %rsi
 ; CHECK-NEXT:    jle .LBB0_39
-; CHECK-NEXT:  .LBB0_30: # %b1139
-; CHECK-NEXT:    # in Loop: Header=BB0_29 Depth=2
-; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    jle .LBB0_31
+; CHECK-NEXT:    jmp .LBB0_30
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_32: # %a63b1266
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
@@ -224,25 +246,6 @@ define dso_local void @foo(ptr %a0, ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5)
 ; CHECK-NEXT:    testq %rsi, %rsi
 ; CHECK-NEXT:    jle .LBB0_32
 ; CHECK-NEXT:    jmp .LBB0_31
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_24: # %b712
-; CHECK-NEXT:    # in Loop: Header=BB0_22 Depth=2
-; CHECK-NEXT:    testb %dil, %dil
-; CHECK-NEXT:    je .LBB0_5
-; CHECK-NEXT:  .LBB0_22: # %b535
-; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
-; CHECK-NEXT:    # => This Loop Header: Depth=2
-; CHECK-NEXT:    # Child Loop BB0_23 Depth 3
-; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    js .LBB0_24
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_23: # %a45b
-; CHECK-NEXT:    # Parent Loop BB0_3 Depth=1
-; CHECK-NEXT:    # Parent Loop BB0_22 Depth=2
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=3
-; CHECK-NEXT:    testb %dil, %dil
-; CHECK-NEXT:    je .LBB0_23
-; CHECK-NEXT:    jmp .LBB0_24
 ; CHECK-NEXT:  .LBB0_10: # %a109b
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index bf939c4131080d3..332a4596eb337e0 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -137,15 +137,6 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    calll __ZN12wxStringBase10ConcatSelfEmPKwm
 ; CHECK-NEXT:  Ltmp11:
 ; CHECK-NEXT:    jmp LBB0_5
-; CHECK-NEXT:  LBB0_9: ## %bb5657
-; CHECK-NEXT:  Ltmp13:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, (%esp)
-; CHECK-NEXT:    calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
-; CHECK-NEXT:  Ltmp14:
-; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  LBB0_20: ## %bb5968
 ; CHECK-NEXT:  Ltmp2:
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -154,6 +145,15 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:    jmp LBB0_25
+; CHECK-NEXT:  LBB0_9: ## %bb5657
+; CHECK-NEXT:  Ltmp13:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
+; CHECK-NEXT:  Ltmp14:
 ; CHECK-NEXT:  LBB0_25: ## %bb115.critedge.i
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    addl $28, %esp
diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
index 214da14322d511e..5ad218f6cdc8934 100644
--- a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
+++ b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
@@ -45,9 +45,6 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:  LBB0_3: ## %RRETURN_6
 ; CHECK-NEXT:    callq _f2
 ; CHECK-NEXT:    jmp LBB0_28
-; CHECK-NEXT:  LBB0_2: ## %RETURN
-; CHECK-NEXT:    callq _f1
-; CHECK-NEXT:    jmp LBB0_28
 ; CHECK-NEXT:  LBB0_4: ## %RRETURN_7
 ; CHECK-NEXT:    callq _f3
 ; CHECK-NEXT:    jmp LBB0_28
@@ -119,6 +116,9 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:    jmp LBB0_28
 ; CHECK-NEXT:  LBB0_27: ## %RRETURN_1
 ; CHECK-NEXT:    callq _f26
+; CHECK-NEXT:    jmp LBB0_28
+; CHECK-NEXT:  LBB0_2: ## %RETURN
+; CHECK-NEXT:    callq _f1
 ; CHECK-NEXT:  LBB0_28: ## %EXIT
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    popq %rcx
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index a522f0e9828a054..5cdabc0a79b0d8e 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -1149,8 +1149,8 @@ define void @test_flow_unwind() personality i32 (...)* @pers {
 ; CHECK: %entry
 ; CHECK: %then
 ; CHECK: %exit
-; CHECK: %innerlp
 ; CHECK: %outerlp
+; CHECK: %innerlp
 ; CHECK: %outercleanup
 entry:
   %0 = invoke i32 @foo()
diff --git a/llvm/test/CodeGen/X86/callbr-asm-outputs.ll b/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
index f5f033398310116..880f85cc7f17b78 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
@@ -50,12 +50,12 @@ define i32 @test2(i32 %out1, i32 %out2) nounwind {
 ; CHECK-NEXT:  .LBB1_2: # Block address taken
 ; CHECK-NEXT:    # %if.then.label_true_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:    jmp .LBB1_8
+; CHECK-NEXT:    jmp .LBB1_9
 ; CHECK-NEXT:  .LBB1_3: # %if.else
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    testl %esi, %edi
 ; CHECK-NEXT:    testl %esi, %edi
-; CHECK-NEXT:    jne .LBB1_9
+; CHECK-NEXT:    jne .LBB1_6
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    movl %esi, %eax
@@ -64,20 +64,20 @@ define i32 @test2(i32 %out1, i32 %out2) nounwind {
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
-; CHECK-NEXT:  .LBB1_7: # Block address taken
-; CHECK-NEXT:    # %if.else.label_true_crit_edge
-; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:  .LBB1_8: # %label_true
-; CHECK-NEXT:    movl $-2, %eax
-; CHECK-NEXT:    jmp .LBB1_5
-; CHECK-NEXT:  .LBB1_9: # Block address taken
+; CHECK-NEXT:  .LBB1_6: # Block address taken
 ; CHECK-NEXT:    # %if.else.return_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:  .LBB1_6: # Block address taken
+; CHECK-NEXT:  .LBB1_7: # Block address taken
 ; CHECK-NEXT:    # %if.then.return_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    jmp .LBB1_5
+; CHECK-NEXT:  .LBB1_8: # Block address taken
+; CHECK-NEXT:    # %if.else.label_true_crit_edge
+; CHECK-NEXT:    # Label of block must be emitted
+; CHECK-NEXT:  .LBB1_9: # %label_true
+; CHECK-NEXT:    movl $-2, %eax
+; CHECK-NEXT:    jmp .LBB1_5
 entry:
   %cmp = icmp slt i32 %out1, %out2
   br i1 %cmp, label %if.then, label %if.else
diff --git a/llvm/test/CodeGen/X86/code_placement_loop_rotation2.ll b/llvm/test/CodeGen/X86/code_placement_loop_rotation2.ll
index cdf2fb05a731374..cc14f73b1c2faa9 100644
--- a/llvm/test/CodeGen/X86/code_placement_loop_rotation2.ll
+++ b/llvm/test/CodeGen/X86/code_placement_loop_rotation2.ll
@@ -59,9 +59,9 @@ define void @bar() !prof !1 {
 ; Test a nested loop case when profile data is available.
 ;
 ; CHECK-PROFILE-LABEL: bar:
+; CHECK-PROFILE: callq g
 ; CHECK-PROFILE: callq h
 ; CHECK-PROFILE: callq b
-; CHECK-PROFILE: callq g
 ; CHECK-PROFILE: callq e
 ; CHECK-PROFILE: callq f
 ; CHECK-PROFILE: callq c
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index adb7319fe80b114..2f0dc756b426108 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -144,17 +144,17 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    movq x0 at GOTPCREL(%rip), %r15
 ; CHECK-NEXT:    movq %rsi, %r12
 ; CHECK-NEXT:    jmp .LBB1_2
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB1_15: # %for.cond1.for.inc3_crit_edge
-; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movl %edx, (%r8)
-; CHECK-NEXT:  .LBB1_16: # %for.inc3
+; CHECK-NEXT:  .LBB1_9: # %vector.body.prol.preheader
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq %rbx, %r12
-; CHECK-NEXT:    incq %r14
-; CHECK-NEXT:    addq %rbx, %r11
-; CHECK-NEXT:    incl %ecx
-; CHECK-NEXT:    je .LBB1_17
+; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; CHECK-NEXT:    movdqu %xmm0, (%r12,%r13,8)
+; CHECK-NEXT:    movdqu %xmm0, 16(%r12,%r13,8)
+; CHECK-NEXT:    movl $4, %r10d
+; CHECK-NEXT:    shrq $2, %rax
+; CHECK-NEXT:    jne .LBB1_11
+; CHECK-NEXT:    jmp .LBB1_13
+; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_2: # %for.cond1thread-pre-split
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB1_12 Depth 2
@@ -197,18 +197,8 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    leaq -4(%rdx), %rax
 ; CHECK-NEXT:    btl $2, %eax
-; CHECK-NEXT:    jb .LBB1_8
-; CHECK-NEXT:  # %bb.9: # %vector.body.prol.preheader
-; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; CHECK-NEXT:    movdqu %xmm0, (%r12,%r13,8)
-; CHECK-NEXT:    movdqu %xmm0, 16(%r12,%r13,8)
-; CHECK-NEXT:    movl $4, %r10d
-; CHECK-NEXT:    shrq $2, %rax
-; CHECK-NEXT:    jne .LBB1_11
-; CHECK-NEXT:    jmp .LBB1_13
-; CHECK-NEXT:  .LBB1_8: # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    jae .LBB1_9
+; CHECK-NEXT:  # %bb.8: # in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    xorl %r10d, %r10d
 ; CHECK-NEXT:    shrq $2, %rax
 ; CHECK-NEXT:    je .LBB1_13
@@ -247,8 +237,17 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    cmpq $-1, %r13
 ; CHECK-NEXT:    movq %rdx, %r13
 ; CHECK-NEXT:    jl .LBB1_14
-; CHECK-NEXT:    jmp .LBB1_15
-; CHECK-NEXT:  .LBB1_17: # %for.cond.for.end5_crit_edge
+; CHECK-NEXT:  .LBB1_15: # %for.cond1.for.inc3_crit_edge
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    movl %edx, (%r8)
+; CHECK-NEXT:  .LBB1_16: # %for.inc3
+; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    addq %rbx, %r12
+; CHECK-NEXT:    incq %r14
+; CHECK-NEXT:    addq %rbx, %r11
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    jne .LBB1_2
+; CHECK-NEXT:  # %bb.17: # %for.cond.for.end5_crit_edge
 ; CHECK-NEXT:    movq x5 at GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movq %rdi, (%rax)
 ; CHECK-NEXT:    movq x3 at GOTPCREL(%rip), %rax
diff --git a/llvm/test/CodeGen/X86/dup-cost.ll b/llvm/test/CodeGen/X86/dup-cost.ll
index ec9d36aa2a11b65..93af487c64ca09d 100644
--- a/llvm/test/CodeGen/X86/dup-cost.ll
+++ b/llvm/test/CodeGen/X86/dup-cost.ll
@@ -16,13 +16,13 @@ define i32 @cold(i32 %a, ptr %p, ptr %q) !prof !21 {
 ; CHECK-NEXT:  # %bb.4: # %true2
 ; CHECK-NEXT:    xorl %edi, %eax
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_2: # %false1
 ; CHECK-NEXT:    movl (%rdx), %eax
 ; CHECK-NEXT:    addl $-3, %eax
 ; CHECK-NEXT:    jmp .LBB0_3
-; CHECK-NEXT:  .LBB0_5: # %false2
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30
diff --git a/llvm/test/CodeGen/X86/hoist-invariant-load.ll b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
index ee23c922174d64c..4b60ce6154d1c08 100644
--- a/llvm/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/llvm/test/CodeGen/X86/hoist-invariant-load.ll
@@ -219,19 +219,21 @@ define void @test_multi_def(ptr dereferenceable(8) align(8) %x1,
 ; CHECK-NEXT:    xorl %r8d, %r8d
 ; CHECK-NEXT:    movq (%rdi), %rdx
 ; CHECK-NEXT:    movq (%rsi), %rsi
+; CHECK-NEXT:    jmp LBB4_2
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB4_1: ## %for.check
+; CHECK-NEXT:    ## in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    incq %r8
+; CHECK-NEXT:    addq $16, %rax
+; CHECK-NEXT:    cmpq %rcx, %r8
+; CHECK-NEXT:    jge LBB4_3
 ; CHECK-NEXT:  LBB4_2: ## %for.body
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mulxq %rsi, %r9, %rdi
 ; CHECK-NEXT:    addq %r9, (%rax)
 ; CHECK-NEXT:    adcq %rdi, 8(%rax)
-; CHECK-NEXT:  ## %bb.1: ## %for.check
-; CHECK-NEXT:    ## in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT:    incq %r8
-; CHECK-NEXT:    addq $16, %rax
-; CHECK-NEXT:    cmpq %rcx, %r8
-; CHECK-NEXT:    jl LBB4_2
-; CHECK-NEXT:  ## %bb.3: ## %exit
+; CHECK-NEXT:    jmp LBB4_1
+; CHECK-NEXT:  LBB4_3: ## %exit
 ; CHECK-NEXT:    retq
                             ptr dereferenceable(8) align(8) %x2,
                             ptr %y, i64 %count) nounwind nofree nosync {
@@ -267,19 +269,21 @@ define void @test_div_def(ptr dereferenceable(8) align(8) %x1,
 ; CHECK-NEXT:    xorl %r9d, %r9d
 ; CHECK-NEXT:    movl (%rdi), %edi
 ; CHECK-NEXT:    movl (%rsi), %esi
+; CHECK-NEXT:    jmp LBB5_2
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB5_1: ## %for.check
+; CHECK-NEXT:    ## in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    incq %r9
+; CHECK-NEXT:    cmpl %ecx, %r9d
+; CHECK-NEXT:    jge LBB5_3
 ; CHECK-NEXT:  LBB5_2: ## %for.body
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    divl %esi
 ; CHECK-NEXT:    addl %eax, (%r8,%r9,4)
-; CHECK-NEXT:  ## %bb.1: ## %for.check
-; CHECK-NEXT:    ## in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    incq %r9
-; CHECK-NEXT:    cmpl %ecx, %r9d
-; CHECK-NEXT:    jl LBB5_2
-; CHECK-NEXT:  ## %bb.3: ## %exit
+; CHECK-NEXT:    jmp LBB5_1
+; CHECK-NEXT:  LBB5_3: ## %exit
 ; CHECK-NEXT:    retq
                           ptr dereferenceable(8) align(8) %x2,
                           ptr %y, i32 %count) nounwind nofree nosync {
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
index 3687194972f27f4..66a4a894093bcb1 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce7.ll
@@ -13,7 +13,12 @@ define fastcc void @outer_loop(ptr nocapture %gfp, ptr nocapture %xr, i32 %targ_
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    movl $88, %eax
 ; CHECK-NEXT:    movl $168, %ecx
+; CHECK-NEXT:    jmp LBB0_2
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB0_1: ## %bb28.i37.loopexit
+; CHECK-NEXT:    ## in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    addl $4, %eax
+; CHECK-NEXT:    addl $168, %ecx
 ; CHECK-NEXT:  LBB0_2: ## %bb28.i37
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB0_3 Depth 2
@@ -27,11 +32,7 @@ define fastcc void @outer_loop(ptr nocapture %gfp, ptr nocapture %xr, i32 %targ_
 ; CHECK-NEXT:    addl $12, %esi
 ; CHECK-NEXT:    cmpl $11, %edx
 ; CHECK-NEXT:    jbe LBB0_3
-; CHECK-NEXT:  ## %bb.1: ## %bb28.i37.loopexit
-; CHECK-NEXT:    ## in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    addl $4, %eax
-; CHECK-NEXT:    addl $168, %ecx
-; CHECK-NEXT:    jmp LBB0_2
+; CHECK-NEXT:    jmp LBB0_1
 entry:
 	br label %bb4
 
diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index beb2dba05e85ac3..9b74ef08a8f5482 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -28,7 +28,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_4:
 ; X86-NEXT:    decl %ecx
 ; X86-NEXT:    cmpl $31, %ecx
-; X86-NEXT:    ja .LBB0_7
+; X86-NEXT:    ja .LBB0_12
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    jmpl *.LJTI0_0(,%ecx,4)
 ; X86-NEXT:  .LBB0_6:
@@ -36,27 +36,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_7:
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB0_8:
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_10:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $2, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_12:
+; X86-NEXT:  .LBB0_10:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_13:
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_11:
 ; X86-NEXT:    leal (,%eax,8), %ecx
 ; X86-NEXT:    jmp .LBB0_42
+; X86-NEXT:  .LBB0_12:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB0_13:
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_14:
+; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -64,13 +64,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_16:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:    jmp .LBB0_9
 ; X86-NEXT:  .LBB0_17:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:    jmp .LBB0_18
 ; X86-NEXT:  .LBB0_19:
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_20:
 ; X86-NEXT:    leal (%eax,%eax,2), %ecx
 ; X86-NEXT:    jmp .LBB0_21
@@ -80,7 +80,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_23
 ; X86-NEXT:  .LBB0_24:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_25:
 ; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    popl %esi
@@ -109,7 +109,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_30:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:    jmp .LBB0_9
 ; X86-NEXT:  .LBB0_31:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:  .LBB0_21:
@@ -128,10 +128,10 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_42
 ; X86-NEXT:  .LBB0_34:
 ; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:  .LBB0_11:
+; X86-NEXT:  .LBB0_9:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -143,7 +143,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_27
 ; X86-NEXT:  .LBB0_37:
 ; X86-NEXT:    leal (%eax,%eax,8), %eax
-; X86-NEXT:  .LBB0_9:
+; X86-NEXT:  .LBB0_7:
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -199,41 +199,54 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:    cmovel %ecx, %eax
 ; X64-HSW-NEXT:    decl %edi
 ; X64-HSW-NEXT:    cmpl $31, %edi
-; X64-HSW-NEXT:    ja .LBB0_3
+; X64-HSW-NEXT:    ja .LBB0_8
 ; X64-HSW-NEXT:  # %bb.1:
 ; X64-HSW-NEXT:    jmpq *.LJTI0_0(,%rdi,8)
 ; X64-HSW-NEXT:  .LBB0_2:
 ; X64-HSW-NEXT:    addl %eax, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_3:
-; X64-HSW-NEXT:    xorl %eax, %eax
 ; X64-HSW-NEXT:  .LBB0_4:
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_6:
 ; X64-HSW-NEXT:    shll $2, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_8:
+; X64-HSW-NEXT:  .LBB0_6:
 ; X64-HSW-NEXT:    addl %eax, %eax
-; X64-HSW-NEXT:  .LBB0_5:
+; X64-HSW-NEXT:  .LBB0_3:
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_33:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_34
+; X64-HSW-NEXT:  .LBB0_8:
+; X64-HSW-NEXT:    xorl %eax, %eax
 ; X64-HSW-NEXT:  .LBB0_9:
-; X64-HSW-NEXT:    leal (,%rax,8), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_38
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_10:
 ; X64-HSW-NEXT:    shll $3, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_12:
 ; X64-HSW-NEXT:    addl %eax, %eax
-; X64-HSW-NEXT:  .LBB0_7:
+; X64-HSW-NEXT:  .LBB0_5:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_31:
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,4), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_34
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_7:
+; X64-HSW-NEXT:    leal (,%rax,8), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_38
 ; X64-HSW-NEXT:  .LBB0_13:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
 ; X64-HSW-NEXT:    leal (%rax,%rcx,2), %eax
@@ -292,33 +305,6 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_27:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
 ; X64-HSW-NEXT:    leal (%rax,%rcx,4), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_34
-; X64-HSW-NEXT:  .LBB0_28:
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %ecx
-; X64-HSW-NEXT:    shll $3, %ecx
-; X64-HSW-NEXT:    jmp .LBB0_38
-; X64-HSW-NEXT:  .LBB0_29:
-; X64-HSW-NEXT:    shll $3, %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_30:
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_31:
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,4), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_34
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_33:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
 ; X64-HSW-NEXT:  .LBB0_34:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
@@ -340,6 +326,10 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_37:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
+; X64-HSW-NEXT:    jmp .LBB0_38
+; X64-HSW-NEXT:  .LBB0_28:
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %ecx
+; X64-HSW-NEXT:    shll $3, %ecx
 ; X64-HSW-NEXT:  .LBB0_38:
 ; X64-HSW-NEXT:    subl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
@@ -348,6 +338,16 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_40:
 ; X64-HSW-NEXT:    shll $5, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_29:
+; X64-HSW-NEXT:    shll $3, %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_30:
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
   %3 = icmp eq i32 %1, 0
   %4 = icmp sgt i32 %1, 1
diff --git a/llvm/test/CodeGen/X86/pic.ll b/llvm/test/CodeGen/X86/pic.ll
index 7c4db752b4e0425..408ea9ad71bd9e6 100644
--- a/llvm/test/CodeGen/X86/pic.ll
+++ b/llvm/test/CodeGen/X86/pic.ll
@@ -231,19 +231,19 @@ bb12:
 ; CHECK-I686:	.long	 .LBB7_5 at GOTOFF
 ; CHECK-I686:	.long	 .LBB7_8 at GOTOFF
 ; CHECK-I686:	.long	 .LBB7_7 at GOTOFF
+; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
 ; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_6-.LJTI7_0
 ; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_10-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_10-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_14-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_14-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_6-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_7-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_7-.LJTI7_0
 }
 
 declare void @foo1(...)
diff --git a/llvm/test/CodeGen/X86/postalloc-coalescing.ll b/llvm/test/CodeGen/X86/postalloc-coalescing.ll
index 68266fc4e158a3c..aba3150447d4adc 100644
--- a/llvm/test/CodeGen/X86/postalloc-coalescing.ll
+++ b/llvm/test/CodeGen/X86/postalloc-coalescing.ll
@@ -5,15 +5,16 @@ define fastcc i32 @_Z18yy_get_next_bufferv() nounwind {
 ; CHECK-LABEL: _Z18yy_get_next_bufferv:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl $42, %eax
-; CHECK-NEXT:    cmpl $-1, %eax
-; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:    jmp .LBB0_2
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %bb116
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    movb %al, 0
+; CHECK-NEXT:  .LBB0_2: # %bb131
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpl $-1, %eax
 ; CHECK-NEXT:    jne .LBB0_1
-; CHECK-NEXT:  .LBB0_3: # %bb158
+; CHECK-NEXT:  # %bb.3: # %bb158
 ; CHECK-NEXT:    movb %al, 0
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr38743.ll b/llvm/test/CodeGen/X86/pr38743.ll
index c05310090660dd0..ef1c87118c8118f 100644
--- a/llvm/test/CodeGen/X86/pr38743.ll
+++ b/llvm/test/CodeGen/X86/pr38743.ll
@@ -27,15 +27,15 @@ define void @pr38743(i32 %a0) #1 align 2 {
 ; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq .str.17(%rip), %rax
 ; CHECK-NEXT:    jmp .LBB0_4
-; CHECK-NEXT:  .LBB0_1: # %bb2
-; CHECK-NEXT:    movq .str.16+7(%rip), %rax
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq .str.16(%rip), %rax
-; CHECK-NEXT:    jmp .LBB0_4
 ; CHECK-NEXT:  .LBB0_3: # %bb8
 ; CHECK-NEXT:    movq .str.18+6(%rip), %rax
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq .str.18(%rip), %rax
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_1: # %bb2
+; CHECK-NEXT:    movq .str.16+7(%rip), %rax
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq .str.16(%rip), %rax
 ; CHECK-NEXT:  .LBB0_4: # %bb12
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index 8e0532e60652800..cb7213325ee9107 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -30,7 +30,12 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_14: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:  .LBB0_13: # %if.end26
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    jne .LBB0_15
+; CHECK-NEXT:  # %bb.14: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movb %dl, %ch
 ; CHECK-NEXT:    movl %ecx, %edx
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
@@ -65,8 +70,22 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # implicit-def: $eax
 ; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    je .LBB0_6
-; CHECK-NEXT:    jmp .LBB0_18
+; CHECK-NEXT:    jne .LBB0_18
+; CHECK-NEXT:    jmp .LBB0_6
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_15: # %if.end26
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    jne .LBB0_16
+; CHECK-NEXT:  # %bb.17: # %if.then31
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movb %dl, %ch
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:  .LBB0_18: # %for.inc
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_3: # %if.then
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -106,18 +125,13 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    je .LBB0_13
+; CHECK-NEXT:    jmp .LBB0_10
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_16: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movb %dl, %ch
+; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    je .LBB0_19
-; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $dl
-; CHECK-NEXT:    # implicit-def: $ebp
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    jne .LBB0_11
 ; CHECK-NEXT:  .LBB0_7: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    movb %dl, %cl
@@ -134,30 +148,19 @@ define dso_local void @fn() {
 ; CHECK-NEXT:  # %bb.24: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movb %ch, %dl
 ; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    jne .LBB0_10
-; CHECK-NEXT:  .LBB0_13: # %if.end26
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    je .LBB0_14
-; CHECK-NEXT:  # %bb.15: # %if.end26
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    testl %ebp, %ebp
-; CHECK-NEXT:    jne .LBB0_16
-; CHECK-NEXT:  # %bb.17: # %if.then31
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:  .LBB0_18: # %for.inc
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    jmp .LBB0_1
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_16: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:    je .LBB0_13
+; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $eax
+; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    je .LBB0_19
+; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $edi
+; CHECK-NEXT:    # implicit-def: $ch
+; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $ebp
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jne .LBB0_11
+; CHECK-NEXT:    jmp .LBB0_7
 entry:
   br label %for.cond
 
diff --git a/llvm/test/CodeGen/X86/pr63692.ll b/llvm/test/CodeGen/X86/pr63692.ll
index fb3198743a3949e..e42540cb9bc158b 100644
--- a/llvm/test/CodeGen/X86/pr63692.ll
+++ b/llvm/test/CodeGen/X86/pr63692.ll
@@ -4,16 +4,17 @@
 define void @prefault(ptr noundef %range_start, ptr noundef readnone %range_end) {
 ; CHECK-LABEL: prefault:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    cmpq %rsi, %rdi
-; CHECK-NEXT:    jae .LBB0_3
+; CHECK-NEXT:    jmp .LBB0_2
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %while.body
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    lock orb $0, (%rdi)
 ; CHECK-NEXT:    addq $4096, %rdi # imm = 0x1000
+; CHECK-NEXT:  .LBB0_2: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpq %rsi, %rdi
 ; CHECK-NEXT:    jb .LBB0_1
-; CHECK-NEXT:  .LBB0_3: # %while.end
+; CHECK-NEXT:  # %bb.3: # %while.end
 ; CHECK-NEXT:    retq
 entry:
   %cmp3 = icmp ult ptr %range_start, %range_end
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index e9448a800fd9597..c50a39c99fb6c65 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -190,14 +190,14 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    movl $268, %ebp ## imm = 0x10C
 ; CHECK-NEXT:    jmp LBB0_20
-; CHECK-NEXT:  LBB0_39: ## %sw.bb566
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $20, %ebp
-; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_19: ## %sw.bb243
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    movl $2, %ebp
 ; CHECK-NEXT:    jmp LBB0_20
+; CHECK-NEXT:  LBB0_39: ## %sw.bb566
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movl $20, %ebp
+; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_32: ## %if.end517.loopexitsplit
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    incq %rbx
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 4d0599022d53847..7c8618af83d3d7b 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -534,12 +534,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB6_2: # %bb0
-; X64-NEXT:    cmovbeq %rax, %rcx
-; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    retq
 ; X64-NEXT:  .LBB6_4: # Block address taken
 ; X64-NEXT:    # %bb2
 ; X64-NEXT:    cmpq $.LBB6_4, %rdx
@@ -564,6 +558,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-NEXT:    movl $11, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
+; X64-NEXT:  .LBB6_2: # %bb0
+; X64-NEXT:    cmovbeq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ;
 ; X64-PIC-LABEL: test_switch_jumptable:
 ; X64-PIC:       # %bb.0: # %entry
@@ -589,12 +589,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
-; X64-PIC-NEXT:  .LBB6_2: # %bb0
-; X64-PIC-NEXT:    cmovbeq %rax, %rcx
-; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $2, %eax
-; X64-PIC-NEXT:    orq %rcx, %rsp
-; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .LBB6_4: # Block address taken
 ; X64-PIC-NEXT:    # %bb2
 ; X64-PIC-NEXT:    leaq .LBB6_4(%rip), %rsi
@@ -622,6 +616,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    movl $11, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB6_2: # %bb0
+; X64-PIC-NEXT:    cmovbeq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $2, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_switch_jumptable:
 ; X64-RETPOLINE:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening.ll b/llvm/test/CodeGen/X86/speculative-load-hardening.ll
index 0c47fcddc43af2a..abd7e0d28fc25ab 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening.ll
@@ -433,22 +433,7 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
 ; X64-LFENCE-NEXT:    xorl %r12d, %r12d
 ; X64-LFENCE-NEXT:    jmp .LBB3_2
 ; X64-LFENCE-NEXT:    .p2align 4, 0x90
-; X64-LFENCE-NEXT:  .LBB3_5: # %l1.latch
-; X64-LFENCE-NEXT:    # in Loop: Header=BB3_2 Depth=1
-; X64-LFENCE-NEXT:    lfence
-; X64-LFENCE-NEXT:    movslq (%r14), %rax
-; X64-LFENCE-NEXT:    movl (%rbx,%rax,4), %edi
-; X64-LFENCE-NEXT:    callq sink at PLT
-; X64-LFENCE-NEXT:    incl %r12d
-; X64-LFENCE-NEXT:    cmpl %r15d, %r12d
-; X64-LFENCE-NEXT:    jge .LBB3_6
-; X64-LFENCE-NEXT:  .LBB3_2: # %l1.header
-; X64-LFENCE-NEXT:    # =>This Loop Header: Depth=1
-; X64-LFENCE-NEXT:    # Child Loop BB3_4 Depth 2
-; X64-LFENCE-NEXT:    lfence
-; X64-LFENCE-NEXT:    testl %r15d, %r15d
-; X64-LFENCE-NEXT:    jle .LBB3_5
-; X64-LFENCE-NEXT:  # %bb.3: # %l2.header.preheader
+; X64-LFENCE-NEXT:  .LBB3_3: # %l2.header.preheader
 ; X64-LFENCE-NEXT:    # in Loop: Header=BB3_2 Depth=1
 ; X64-LFENCE-NEXT:    lfence
 ; X64-LFENCE-NEXT:    xorl %r13d, %r13d
@@ -463,6 +448,21 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, ptr %ptr1, ptr %ptr2
 ; X64-LFENCE-NEXT:    incl %r13d
 ; X64-LFENCE-NEXT:    cmpl %ebp, %r13d
 ; X64-LFENCE-NEXT:    jl .LBB3_4
+; X64-LFENCE-NEXT:  .LBB3_5: # %l1.latch
+; X64-LFENCE-NEXT:    # in Loop: Header=BB3_2 Depth=1
+; X64-LFENCE-NEXT:    lfence
+; X64-LFENCE-NEXT:    movslq (%r14), %rax
+; X64-LFENCE-NEXT:    movl (%rbx,%rax,4), %edi
+; X64-LFENCE-NEXT:    callq sink at PLT
+; X64-LFENCE-NEXT:    incl %r12d
+; X64-LFENCE-NEXT:    cmpl %r15d, %r12d
+; X64-LFENCE-NEXT:    jge .LBB3_6
+; X64-LFENCE-NEXT:  .LBB3_2: # %l1.header
+; X64-LFENCE-NEXT:    # =>This Loop Header: Depth=1
+; X64-LFENCE-NEXT:    # Child Loop BB3_4 Depth 2
+; X64-LFENCE-NEXT:    lfence
+; X64-LFENCE-NEXT:    testl %r15d, %r15d
+; X64-LFENCE-NEXT:    jg .LBB3_3
 ; X64-LFENCE-NEXT:    jmp .LBB3_5
 entry:
   %a.cmp = icmp eq i32 %a, 0
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index f5040f2b2bab557..264c5bf15aa28bb 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -284,8 +284,6 @@ define void @jt_is_better(i32 %x) {
 ; CHECK-NEXT:  .LBB4_3: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB4_7: # %return
-; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB4_4: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -295,6 +293,8 @@ define void @jt_is_better(i32 %x) {
 ; CHECK-NEXT:  .LBB4_6: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB4_7: # %return
+; CHECK-NEXT:    retq
 ;
 ; NOOPT-LABEL: jt_is_better:
 ; NOOPT:       # %bb.0: # %entry
@@ -1880,13 +1880,6 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.7: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_12: # %entry
-; CHECK-NEXT:    cmpl $50, %edi
-; CHECK-NEXT:    je .LBB19_17
-; CHECK-NEXT:  # %bb.13: # %entry
-; CHECK-NEXT:    cmpl $60, %edi
-; CHECK-NEXT:    je .LBB19_14
-; CHECK-NEXT:    jmp .LBB19_18
 ; CHECK-NEXT:  .LBB19_8: # %entry
 ; CHECK-NEXT:    cmpl $30, %edi
 ; CHECK-NEXT:    je .LBB19_16
@@ -1896,6 +1889,14 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.10: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_12: # %entry
+; CHECK-NEXT:    cmpl $50, %edi
+; CHECK-NEXT:    je .LBB19_17
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    cmpl $60, %edi
+; CHECK-NEXT:    je .LBB19_14
+; CHECK-NEXT:  .LBB19_18: # %return
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_15: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -1905,8 +1906,6 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  .LBB19_17: # %bb5
 ; CHECK-NEXT:    movl $5, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_18: # %return
-; CHECK-NEXT:    retq
 ;
 ; NOOPT-LABEL: left_leaning_weight_balanced_tree:
 ; NOOPT:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
index 937cc173e7faefd..1d728b62b3796b6 100644
--- a/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
+++ b/llvm/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -7,27 +7,25 @@ define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %inner_loop_top
+; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
+; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    cmpb $0, (%rsi)
+; CHECK-NEXT:    js .LBB0_3
+; CHECK-NEXT:  # %bb.4: # %inner_loop_latch
+; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=2
+; CHECK-NEXT:    addq $2, %rsi
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_3: # %inner_loop_exit
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    incq %rsi
 ; CHECK-NEXT:  .LBB0_1: # %outer_loop_top
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_4 Depth 2
+; CHECK-NEXT:    # Child Loop BB0_2 Depth 2
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    je .LBB0_5
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  # %bb.2: # %inner_loop_top
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    cmpb $0, (%rsi)
-; CHECK-NEXT:    js .LBB0_3
-; CHECK-NEXT:  .LBB0_4: # %inner_loop_latch
-; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    addq $2, %rsi
-; CHECK-NEXT:    cmpb $0, (%rsi)
-; CHECK-NEXT:    jns .LBB0_4
-; CHECK-NEXT:    jmp .LBB0_3
-; CHECK-NEXT:  .LBB0_5: # %exit
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # %bb.5: # %exit
 ; CHECK-NEXT:    retq
 entry:
   %notlhs674.i = icmp eq i32 %a, 0
@@ -130,48 +128,50 @@ define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i3
 ; CHECK-NEXT:  # %bb.6: # %shared_preheader
 ; CHECK-NEXT:    movb $32, %cl
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    jmp .LBB1_8
+; CHECK-NEXT:    jmp .LBB1_10
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB1_7: # %merge_predecessor_split
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:  .LBB1_7: # %if.end287.i
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    jne .LBB1_10
+; CHECK-NEXT:  # %bb.8: # %if.end308.i
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB1_14
+; CHECK-NEXT:  # %bb.9: # %merge_predecessor_split
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
 ; CHECK-NEXT:    movb $32, %cl
-; CHECK-NEXT:  .LBB1_8: # %outer_loop_header
+; CHECK-NEXT:  .LBB1_10: # %outer_loop_header
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB1_9 Depth 2
+; CHECK-NEXT:    # Child Loop BB1_11 Depth 2
 ; CHECK-NEXT:    testl %r13d, %r13d
 ; CHECK-NEXT:    je .LBB1_16
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB1_9: # %shared_loop_header
-; CHECK-NEXT:    # Parent Loop BB1_8 Depth=1
+; CHECK-NEXT:  .LBB1_11: # %shared_loop_header
+; CHECK-NEXT:    # Parent Loop BB1_10 Depth=1
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    testq %r14, %r14
 ; CHECK-NEXT:    jne .LBB1_25
-; CHECK-NEXT:  # %bb.10: # %inner_loop_body
-; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=2
+; CHECK-NEXT:  # %bb.12: # %inner_loop_body
+; CHECK-NEXT:    # in Loop: Header=BB1_11 Depth=2
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB1_9
-; CHECK-NEXT:  # %bb.11: # %if.end96.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    je .LBB1_11
+; CHECK-NEXT:  # %bb.13: # %if.end96.i
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
 ; CHECK-NEXT:    cmpl $3, %r13d
-; CHECK-NEXT:    jae .LBB1_20
-; CHECK-NEXT:  # %bb.12: # %if.end287.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    # implicit-def: $cl
-; CHECK-NEXT:    jne .LBB1_8
-; CHECK-NEXT:  # %bb.13: # %if.end308.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB1_7
-; CHECK-NEXT:  # %bb.14: # %if.end335.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jb .LBB1_7
+; CHECK-NEXT:    jmp .LBB1_20
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_14: # %if.end335.i
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
 ; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    testb %cl, %cl
-; CHECK-NEXT:    jne .LBB1_8
+; CHECK-NEXT:    jne .LBB1_10
 ; CHECK-NEXT:  # %bb.15: # %merge_other
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    # in Loop: Header=BB1_10 Depth=1
 ; CHECK-NEXT:    # implicit-def: $cl
-; CHECK-NEXT:    jmp .LBB1_8
+; CHECK-NEXT:    jmp .LBB1_10
 ; CHECK-NEXT:  .LBB1_23:
 ; CHECK-NEXT:    movl $1, %ebx
 ; CHECK-NEXT:    jmp .LBB1_24
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 59612bfe9a535ea..d2067dd4e51c24a 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -64,13 +64,13 @@ try.cont:
 ; X86: retl
 
 ; FIXME: These should be de-duplicated.
-; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT:                        # %handler2
+; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT:                        # %handler1
 ; X86-NEXT: addl $12, %ebp
 ; X86: jmp [[contbb]]
 
-; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT:                        # %handler1
+; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT:                        # %handler2
 ; X86-NEXT: addl $12, %ebp
 ; X86: jmp [[contbb]]
 
diff --git a/llvm/unittests/Support/BlockFrequencyTest.cpp b/llvm/unittests/Support/BlockFrequencyTest.cpp
index cf5cf9213501c47..7e97eb5f4a0ec7c 100644
--- a/llvm/unittests/Support/BlockFrequencyTest.cpp
+++ b/llvm/unittests/Support/BlockFrequencyTest.cpp
@@ -124,4 +124,54 @@ TEST(BlockFrequencyTest, SaturatingRightShift) {
   EXPECT_EQ(Freq.getFrequency(), 0x1ULL);
 }
 
+TEST(BlockFrequencyTest, GreaterWithTolerance) {
+  EXPECT_TRUE(BlockFrequency(0x1234).greaterWithTolerance(0, 20));
+  EXPECT_TRUE(BlockFrequency(0x1234).greaterWithTolerance(0x1233, 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).greaterWithTolerance(0x1234, 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).greaterWithTolerance(0x1235, 20));
+
+  EXPECT_FALSE(BlockFrequency(0x1234).greaterWithTolerance(0x1233, 2));
+  EXPECT_FALSE(BlockFrequency(0x1234).greaterWithTolerance(0x1233, 8));
+  EXPECT_FALSE(BlockFrequency(0x1234).greaterWithTolerance(0x1233, 12));
+  EXPECT_TRUE(BlockFrequency(0x1234).greaterWithTolerance(0x1233, 13));
+
+  EXPECT_TRUE(BlockFrequency(0x10000000000).greaterWithTolerance(0x10, 5));
+  EXPECT_FALSE(BlockFrequency(0x10).greaterWithTolerance(0x10000000000, 5));
+
+  uint64_t Max = std::numeric_limits<uint64_t>::max();
+  EXPECT_FALSE(BlockFrequency(Max).greaterWithTolerance(Max, 2));
+  EXPECT_FALSE(BlockFrequency(Max).greaterWithTolerance(Max, 64));
+  EXPECT_TRUE(BlockFrequency(Max).greaterWithTolerance(0, 2));
+  EXPECT_TRUE(BlockFrequency(Max).greaterWithTolerance(0, 64));
+  EXPECT_FALSE(BlockFrequency(0).greaterWithTolerance(Max, 2));
+  EXPECT_FALSE(BlockFrequency(0).greaterWithTolerance(Max, 64));
+  EXPECT_FALSE(BlockFrequency(0).greaterWithTolerance(0, 2));
+  EXPECT_FALSE(BlockFrequency(0).greaterWithTolerance(0, 64));
+}
+
+TEST(BlockFrequencyTest, LessWithTolerance) {
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0, 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0x1233, 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0x1234, 20));
+  EXPECT_TRUE(BlockFrequency(0x1234).lessWithTolerance(0x1235, 20));
+
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0x1235, 2));
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0x1235, 8));
+  EXPECT_FALSE(BlockFrequency(0x1234).lessWithTolerance(0x1235, 12));
+  EXPECT_TRUE(BlockFrequency(0x1234).lessWithTolerance(0x1235, 13));
+
+  EXPECT_FALSE(BlockFrequency(0x10000000000).lessWithTolerance(0x10, 5));
+  EXPECT_TRUE(BlockFrequency(0x10).lessWithTolerance(0x10000000000, 5));
+
+  uint64_t Max = std::numeric_limits<uint64_t>::max();
+  EXPECT_FALSE(BlockFrequency(Max).lessWithTolerance(Max, 2));
+  EXPECT_FALSE(BlockFrequency(Max).lessWithTolerance(Max, 64));
+  EXPECT_FALSE(BlockFrequency(Max).lessWithTolerance(0, 2));
+  EXPECT_FALSE(BlockFrequency(Max).lessWithTolerance(0, 64));
+  EXPECT_TRUE(BlockFrequency(0).lessWithTolerance(Max, 2));
+  EXPECT_TRUE(BlockFrequency(0).lessWithTolerance(Max, 64));
+  EXPECT_FALSE(BlockFrequency(0).lessWithTolerance(0, 2));
+  EXPECT_FALSE(BlockFrequency(0).lessWithTolerance(0, 64));
+}
+
 }

>From 539dedb80e861f3c44bc0ee3788e556cd485175c Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 15:02:56 -0700
Subject: [PATCH 3/4] Add test for BlockFrequencyInfo precision for small
 min/max spreads

---
 .../Analysis/BlockFrequencyInfo/precision.ll  | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 llvm/test/Analysis/BlockFrequencyInfo/precision.ll

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
new file mode 100644
index 000000000000000..4001fe991d6d9e8
--- /dev/null
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
+; Sanity check precision for small-ish min/max spread.
+
+ at g = global i32 0
+
+; CHECK-LABEL: block-frequency-info: func0
+; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - join: float = 1.0, {{.*}}, count = 1000
+
+define void @func0(i32 %a0, i32 %a1) !prof !0 {
+entry:
+  %cmp0 = icmp ne i32 %a0, 0
+  br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1
+
+cmp0_true:
+  store volatile i32 1, ptr @g
+  %cmp1 = icmp ne i32 %a1, 0
+  br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2
+
+cmp0_false:
+  store volatile i32 2, ptr @g
+  br label %join
+
+cmp1_true:
+  store volatile i32 3, ptr @g
+  br label %join
+
+cmp1_false:
+  store volatile i32 4, ptr @g
+  br label %join
+
+join:
+  store volatile i32 5, ptr @g
+  ret void
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 400, i32 600}
+!2 = !{!"branch_weights", i32 1, i32 3}

>From 521e483d7fe59e573b1c9d786c5bf1dbd76a4594 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 14:45:09 -0700
Subject: [PATCH 4/4] BlockFrequencyInfoImpl: Increase precision for small
 min/max spreads

BlockFrequencyInfo stores its result as integer values expressing
frequencies within a function relative to the frequency of the entry
block. This also means that the precision for frequencies < 1.0
depends on the choice for the entry block.

Before this change we would often end up with unnecessarily small
choices resuling in unnecessarily poor precision. This simplifies the
algorithm to use the full 64bits available as much as possible.
---
 ...rprof-gcov-multiple-bbs-single-line.c.gcov |  16 +-
 llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp  |  34 +--
 .../loops_with_profile_info.ll                |  19 +-
 .../Analysis/BlockFrequencyInfo/precision.ll  |   6 +-
 .../arm64-spill-remarks-treshold-hotness.ll   |   2 +-
 llvm/test/CodeGen/AArch64/sink-and-fold.ll    |  12 +-
 .../greedy-broken-ssa-verifier-error.mir      |   2 +-
 .../AMDGPU/tuple-allocation-failure.ll        | 174 ++++++-----
 llvm/test/CodeGen/Generic/bb-profile-dump.ll  |   8 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll   |  80 +++---
 llvm/test/CodeGen/PowerPC/reduce_cr.ll        |  18 +-
 llvm/test/CodeGen/PowerPC/tail-dup-layout.ll  |  10 +-
 llvm/test/CodeGen/RISCV/branch-relaxation.ll  |  86 +++---
 .../test/CodeGen/Thumb2/mve-blockplacement.ll | 270 +++++++++---------
 .../CodeGen/Thumb2/mve-float32regloops.ll     |  36 +--
 llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll  |  22 +-
 llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll  | 119 ++++----
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  40 +--
 llvm/test/CodeGen/VE/Scalar/brind.ll          |  15 +-
 llvm/test/CodeGen/X86/bb_rotate.ll            |   2 +-
 .../X86/code_placement_ext_tsp_large.ll       |   8 +-
 .../X86/div-rem-pair-recomposition-signed.ll  | 134 +++++----
 .../div-rem-pair-recomposition-unsigned.ll    | 206 ++++++-------
 llvm/test/CodeGen/X86/fsafdo_test3.ll         |  80 +++---
 llvm/test/CodeGen/X86/pr38795.ll              |  74 ++---
 .../speculative-load-hardening-indirect.ll    |  94 +++---
 llvm/test/CodeGen/X86/statepoint-ra.ll        |   2 +-
 llvm/test/CodeGen/X86/switch.ll               |  23 +-
 .../X86/tail-dup-no-other-successor.ll        |   2 +-
 llvm/test/CodeGen/X86/tail-opts.ll            |  14 +-
 llvm/test/Other/cfg-printer-branch-weights.ll |   4 +-
 llvm/test/ThinLTO/X86/function_entry_count.ll |   2 +-
 .../CodeExtractor/MultipleExitBranchProb.ll   |   2 +-
 .../X86/pr52689-not-all-uses-rebased.ll       |   4 +
 .../Transforms/JumpThreading/thread-prob-7.ll |   2 +-
 .../JumpThreading/update-edge-weight.ll       |   2 +-
 llvm/test/Transforms/LICM/loopsink.ll         |  20 +-
 .../AArch64/opt-remark-with-hotness.ll        |   2 +-
 .../diagnostics-with-hotness.ll               |   2 +-
 .../LoopRotate/update-branch-weights.ll       |  12 +-
 .../Transforms/LoopVectorize/X86/avx512.ll    |  23 +-
 .../X86/no_fpmath_with_hotness.ll             |   2 +-
 .../LoopVectorize/diag-with-hotness-info-2.ll |   2 +-
 .../LoopVectorize/diag-with-hotness-info.ll   |   2 +-
 .../PGOProfile/Inputs/PR41279_2.proftext      |   3 +-
 .../PGOProfile/Inputs/criticaledge.proftext   |   4 +-
 .../Inputs/criticaledge_entry.proftext        |   4 +-
 .../PGOProfile/Inputs/indirectbr.proftext     |   2 +-
 .../Inputs/indirectbr_entry.proftext          |   2 +-
 llvm/test/Transforms/PGOProfile/PR41279_2.ll  |  23 +-
 .../Transforms/PGOProfile/bfi_verification.ll |  16 +-
 .../Transforms/PGOProfile/criticaledge.ll     |   6 +-
 llvm/test/Transforms/PGOProfile/fix_bfi.ll    |   2 +-
 llvm/test/Transforms/PGOProfile/loop2.ll      |   6 +-
 .../profile-correlation-irreducible-loops.ll  |   6 +-
 .../profile-inference-rebalance.ll            |   8 +-
 .../SampleProfile/pseudo-probe-update-2.ll    |  10 +-
 57 files changed, 883 insertions(+), 898 deletions(-)

diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
index 4debf8fc1b680d9..9297073d21ef80e 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
@@ -10,25 +10,25 @@
 // CHECK-NEXT:        -:    4:
 // CHECK-NEXT:        1:    5:  int a = 1;
 // CHECK-NEXT:        1:    6:  if (a) {
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        1:    7:    var++;
 // CHECK-NEXT:        1:    8:  }
 // CHECK-NEXT:        -:    9:
 // CHECK-NEXT:        1:   10:  if (a) {}
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        -:   11:
 // CHECK-NEXT:        1:   12:  int b = 0;
 // CHECK-NEXT:        1:   13:  if (b) {
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:    #####:   14:    var++;
 // CHECK-NEXT:    #####:   15:  }
 // CHECK-NEXT:        -:   16:
 // CHECK-NEXT:        1:   17:  if (b) {}
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:        -:   18:
 // CHECK-NEXT:        1:   19:  return 0;
 // CHECK-NEXT:        -:   20:}
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 82b1e3b9eede709..d4efb8871cf3054 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -481,30 +481,24 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
-  // Scale the Factor to a size that creates integers.  Ideally, integers would
-  // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, in the presence of large frequency values, small
-  // frequencies are scaled down to 1, making it impossible to differentiate
-  // small, unequal numbers. When the spread between Min and Max frequencies
-  // fits well within MaxBits, we make the scale be at least 8.
-  const unsigned MaxBits = 64;
-  const unsigned SpreadBits = (Max / Min).lg();
-  Scaled64 ScalingFactor;
-  if (SpreadBits <= MaxBits - 3) {
-    // If the values are small enough, make the scaling factor at least 8 to
-    // allow distinguishing small values.
-    ScalingFactor = Min.inverse();
-    ScalingFactor <<= 3;
-  } else {
-    // If the values need more than MaxBits to be represented, saturate small
-    // frequency values down to 1 by using a scaling factor that benefits large
-    // frequency values.
-    ScalingFactor = Scaled64(1, MaxBits) / Max;
-  }
+  // Scale the Factor to a size that creates integers.  If possible scale
+  // integers so that Max == UINT64_MAX so that they can be best differentiated.
+  // Is is possible that the range between min and max cannot be accurately
+  // represented in a 64bit integer without either loosing precision for small
+  // values (so small unequal numbers all map to 1) or saturaturing big numbers
+  // loosing precision for big numbers (so unequal big numbers may map to
+  // UINT64_MAX). We choose to loose precision for small numbers.
+  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
+  // Users often add up multiple BlockFrequency values or multiply them with
+  // things like instruction costs. Leave some room to avoid saturating
+  // operations reaching UIN64_MAX too early.
+  const unsigned Slack = 10;
+  Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;
 
   // Translate the floats to integers.
   LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
                     << ", factor = " << ScalingFactor << "\n");
+  (void)Min;
   for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
     Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
     BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
index 41226a1cdfbaf32..7cebfb114f4ed4e 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)
 
 ; CHECK: Printing analysis {{.*}} for function 'main':
 ; CHECK-NEXT: block-frequency-info: main
-define i32 @main() {
+define i32 @main() !prof !6 {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -93,7 +93,7 @@ for.cond4:                                        ; preds = %for.inc, %for.body3
   %cmp5 = icmp slt i32 %2, 100
   br i1 %cmp5, label %for.body6, label %for.end, !prof !3
 
-; CHECK: - for.body6: float = 500000.5, int = 4000004
+; CHECK: - for.body6: float = 1000000.0,{{.*}}count = 1000000
 for.body6:                                        ; preds = %for.cond4
   call void @bar()
   br label %for.inc
@@ -143,7 +143,7 @@ for.cond16:                                       ; preds = %for.inc19, %for.bod
   %cmp17 = icmp slt i32 %8, 10000
   br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
 
-; CHECK: - for.body18: float = 499999.9, int = 3999998
+; CHECK: - for.body18: float = 999999.5,{{.*}}count = 1000000
 for.body18:                                       ; preds = %for.cond16
   call void @bar()
   br label %for.inc19
@@ -175,7 +175,7 @@ for.cond26:                                       ; preds = %for.inc29, %for.end
   %cmp27 = icmp slt i32 %12, 1000000
   br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
 
-; CHECK: - for.body28: float = 499995.2, int = 3999961
+; CHECK: - for.body28: float = 1000224.3,{{.*}}count = 1000224
 for.body28:                                       ; preds = %for.cond26
   call void @bar()
   br label %for.inc29
@@ -197,8 +197,9 @@ for.end31:                                        ; preds = %for.cond26
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
-!1 = !{!"branch_weights", i32 101, i32 2}
-!2 = !{!"branch_weights", i32 10001, i32 101}
-!3 = !{!"branch_weights", i32 1000001, i32 10001}
-!4 = !{!"branch_weights", i32 1000001, i32 101}
-!5 = !{!"branch_weights", i32 1000001, i32 2}
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 10000, i32 100}
+!3 = !{!"branch_weights", i32 1000000, i32 10000}
+!4 = !{!"branch_weights", i32 1000000, i32 100}
+!5 = !{!"branch_weights", i32 1000000, i32 1}
+!6 = !{!"function_entry_count", i32 1}
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
index 4001fe991d6d9e8..7408d002d065d5b 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -5,10 +5,10 @@
 
 ; CHECK-LABEL: block-frequency-info: func0
 ; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
-; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
 ; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
-; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
-; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
 ; CHECK: - join: float = 1.0, {{.*}}, count = 1000
 
 define void @func0(i32 %a0, i32 %a1) !prof !0 {
diff --git a/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll b/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
index 0578ab585402af9..5f849c67b0ca318 100644
--- a/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
@@ -5,7 +5,7 @@
 ; RUN:       -pass-remarks-with-hotness -pass-remarks-hotness-threshold=1 \
 ; RUN:       2>&1 | FileCheck -check-prefix=THRESHOLD %s
 
-; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.187500e+01 total spills cost 1 reloads 3.187500e+01 total reloads cost generated in loop{{$}}
+; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.200000e+01 total spills cost 1 reloads 3.200000e+01 total reloads cost generated in loop{{$}}
 ; THRESHOLD-NOT: remark
 
 define void @fpr128(ptr %p) nounwind ssp {
diff --git a/llvm/test/CodeGen/AArch64/sink-and-fold.ll b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
index 66afb840b50db99..0071fabea8bcdb3 100644
--- a/llvm/test/CodeGen/AArch64/sink-and-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sink-and-fold.ll
@@ -142,11 +142,10 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x1
 ; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    b .LBB4_7
 ; CHECK-NEXT:  .LBB4_2: // %if.else
 ; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
-; CHECK-NEXT:    ldr w0, [x22]
+; CHECK-NEXT:    ldr w0, [x20, x22, lsl #2]
 ; CHECK-NEXT:  .LBB4_3: // %LJ.latch
 ; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
 ; CHECK-NEXT:    add x8, x21, #1
@@ -162,21 +161,20 @@ define void @f4(ptr %a, i64 %n) nounwind "target-features"="+alu-lsl-fast,+addr-
 ; CHECK-NEXT:    tbz w8, #31, .LBB4_2
 ; CHECK-NEXT:  // %bb.5: // %if.then
 ; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=2
-; CHECK-NEXT:    mov x0, x22
+; CHECK-NEXT:    add x0, x20, x22, lsl #2
 ; CHECK-NEXT:    mov x1, x21
 ; CHECK-NEXT:    bl use
 ; CHECK-NEXT:    b .LBB4_3
 ; CHECK-NEXT:  .LBB4_6: // %LI.latch
 ; CHECK-NEXT:    // in Loop: Header=BB4_7 Depth=1
-; CHECK-NEXT:    cmp x23, x19
-; CHECK-NEXT:    mov x23, x24
+; CHECK-NEXT:    cmp x22, x19
+; CHECK-NEXT:    mov x22, x23
 ; CHECK-NEXT:    b.ge .LBB4_8
 ; CHECK-NEXT:  .LBB4_7: // %LI
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB4_4 Depth 2
-; CHECK-NEXT:    add x22, x20, x23, lsl #2
 ; CHECK-NEXT:    mov x21, xzr
-; CHECK-NEXT:    add x24, x23, #1
+; CHECK-NEXT:    add x23, x22, #1
 ; CHECK-NEXT:    b .LBB4_4
 ; CHECK-NEXT:  .LBB4_8:
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
index 537bea7d2cfbe39..7a623d235950dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
@@ -15,7 +15,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: ra_introduces_vreg_def
   ; GCN: [[COPY_V0:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN: [[COPY_V0]]:vgpr_32 =
+  ; GCN: [[COPY_V1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   bb.0:
     liveins: $vgpr0, $vgpr1
     %0:vgpr_32 = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index d4941235680bf90..3e1cbce63e538dc 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -114,7 +114,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS1-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
@@ -133,7 +133,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_8
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s79, 1
@@ -143,17 +143,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_cbranch_execnz .LBB1_8
-; GLOBALNESS1-NEXT:    s_branch .LBB1_23
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_8
+; GLOBALNESS1-NEXT:    s_branch .LBB1_9
 ; GLOBALNESS1-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_branch .LBB1_23
-; GLOBALNESS1-NEXT:  .LBB1_8: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %baz.exit.i
+; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[2:3]
@@ -163,57 +167,57 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS1-NEXT:  ; %bb.10: ; %bb33.i
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_12
-; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb39.i
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
+; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS1-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[66:67], 0, v2
-; GLOBALNESS1-NEXT:    s_branch .LBB1_15
-; GLOBALNESS1-NEXT:  .LBB1_13: ; %Flow16
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_branch .LBB1_16
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb63.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb44.i
+; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.16: ; %bb46.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb50.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb3.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[44:45]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb6.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[64:65]
-; GLOBALNESS1-NEXT:  .LBB1_20: ; %spam.exit.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[56:57]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb55.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_add_u32 s68, s38, 40
 ; GLOBALNESS1-NEXT:    s_addc_u32 s69, s39, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -237,19 +241,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[66:67]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_13
-; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb62.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS1-NEXT:    s_branch .LBB1_13
-; GLOBALNESS1-NEXT:  .LBB1_23: ; %LeafBlock
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -401,7 +398,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS0-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
@@ -420,7 +417,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_8
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s75, 1
@@ -430,17 +427,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_cbranch_execnz .LBB1_8
-; GLOBALNESS0-NEXT:    s_branch .LBB1_23
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_8
+; GLOBALNESS0-NEXT:    s_branch .LBB1_9
 ; GLOBALNESS0-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_branch .LBB1_23
-; GLOBALNESS0-NEXT:  .LBB1_8: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %baz.exit.i
+; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[2:3]
@@ -450,57 +451,57 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS0-NEXT:  ; %bb.10: ; %bb33.i
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_12
-; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb39.i
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
+; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS0-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[66:67], 0, v2
-; GLOBALNESS0-NEXT:    s_branch .LBB1_15
-; GLOBALNESS0-NEXT:  .LBB1_13: ; %Flow16
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_branch .LBB1_16
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb63.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb44.i
+; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.16: ; %bb46.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb50.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb3.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[44:45]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb6.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[64:65]
-; GLOBALNESS0-NEXT:  .LBB1_20: ; %spam.exit.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[56:57]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb55.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_add_u32 s72, s38, 40
 ; GLOBALNESS0-NEXT:    s_addc_u32 s73, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -524,19 +525,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[66:67]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_13
-; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb62.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS0-NEXT:    s_branch .LBB1_13
-; GLOBALNESS0-NEXT:  .LBB1_23: ; %LeafBlock
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/Generic/bb-profile-dump.ll b/llvm/test/CodeGen/Generic/bb-profile-dump.ll
index 934c281219d4ae7..12e07653f360eb2 100644
--- a/llvm/test/CodeGen/Generic/bb-profile-dump.ll
+++ b/llvm/test/CodeGen/Generic/bb-profile-dump.ll
@@ -12,7 +12,7 @@ define i64 @f2(i64 %a, i64 %b) {
     ret i64 %sum
 }
 
-; CHECK: f2,0,8
+; CHECK: f2,0,18014398509481984
 
 define i64 @f1() {
     %sum = call i64 @f2(i64 2, i64 2)
@@ -24,9 +24,9 @@ ifNotEqual:
     ret i64 %sum
 }
 
-; CHECK-NEXT: f1,0,16
-; CHECK-NEXT: f1,1,8
-; CHECK-NEXT: f1,2,16
+; CHECK-NEXT: f1,0,18014398509481984
+; CHECK-NEXT: f1,1,9007199254740992
+; CHECK-NEXT: f1,2,18014398509481984
 
 ; Check that if we pass -mbb-profile-dump but don't set -basic-block-sections,
 ; we get an appropriate error message
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
index e97e17fc64782f4..b41f6c75133035d 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
@@ -87,11 +87,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_8
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_9: # %bb17
+; CHECK-NEXT:  .LBB0_9: # %bb54
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_9
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_10: # %bb26
+; CHECK-NEXT:  .LBB0_10: # %bb17
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:    .p2align 4
@@ -99,7 +99,7 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_11
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_12: # %bb54
+; CHECK-NEXT:  .LBB0_12: # %bb26
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_12
 ; CHECK-NEXT:    .p2align 4
@@ -107,71 +107,71 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_13
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_14: # %bb47
+; CHECK-NEXT:  .LBB0_14: # %bb62
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_14
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_15: # %bb24
+; CHECK-NEXT:  .LBB0_15: # %bb47
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_15
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_16: # %bb19
+; CHECK-NEXT:  .LBB0_16: # %bb48
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_16
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_17: # %bb59
+; CHECK-NEXT:  .LBB0_17: # %bb49
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_17
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_18: # %bb46
+; CHECK-NEXT:  .LBB0_18: # %bb50
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_18
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_19: # %bb49
+; CHECK-NEXT:  .LBB0_19: # %bb58
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_19
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_20: # %bb57
+; CHECK-NEXT:  .LBB0_20: # %bb59
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_20
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_21: # %bb18
+; CHECK-NEXT:  .LBB0_21: # %bb60
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_21
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_22: # %bb58
+; CHECK-NEXT:  .LBB0_22: # %bb24
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_22
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_23: # %bb23
+; CHECK-NEXT:  .LBB0_23: # %bb19
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_23
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_24: # %bb60
+; CHECK-NEXT:  .LBB0_24: # %bb20
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_24
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_25: # %bb55
+; CHECK-NEXT:  .LBB0_25: # %bb23
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_25
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_26: # %bb56
+; CHECK-NEXT:  .LBB0_26: # %bb46
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_26
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_27: # %bb62
+; CHECK-NEXT:  .LBB0_27: # %bb55
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_27
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_28: # %bb20
+; CHECK-NEXT:  .LBB0_28: # %bb56
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_28
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_29: # %bb50
+; CHECK-NEXT:  .LBB0_29: # %bb57
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_29
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_30: # %bb48
+; CHECK-NEXT:  .LBB0_30: # %bb18
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_30
 ; CHECK-NEXT:  .LBB0_31: # %bb9
@@ -280,11 +280,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_8
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_9: # %bb17
+; CHECK-BE-NEXT:  .LBB0_9: # %bb54
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_9
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_10: # %bb26
+; CHECK-BE-NEXT:  .LBB0_10: # %bb17
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_10
 ; CHECK-BE-NEXT:    .p2align 4
@@ -292,7 +292,7 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_11
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_12: # %bb54
+; CHECK-BE-NEXT:  .LBB0_12: # %bb26
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_12
 ; CHECK-BE-NEXT:    .p2align 4
@@ -300,71 +300,71 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_13
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_14: # %bb47
+; CHECK-BE-NEXT:  .LBB0_14: # %bb62
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_14
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_15: # %bb24
+; CHECK-BE-NEXT:  .LBB0_15: # %bb47
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_15
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_16: # %bb19
+; CHECK-BE-NEXT:  .LBB0_16: # %bb48
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_16
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_17: # %bb59
+; CHECK-BE-NEXT:  .LBB0_17: # %bb49
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_17
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_18: # %bb46
+; CHECK-BE-NEXT:  .LBB0_18: # %bb50
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_18
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_19: # %bb49
+; CHECK-BE-NEXT:  .LBB0_19: # %bb58
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_19
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_20: # %bb57
+; CHECK-BE-NEXT:  .LBB0_20: # %bb59
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_20
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_21: # %bb18
+; CHECK-BE-NEXT:  .LBB0_21: # %bb60
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_21
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_22: # %bb58
+; CHECK-BE-NEXT:  .LBB0_22: # %bb24
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_22
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_23: # %bb23
+; CHECK-BE-NEXT:  .LBB0_23: # %bb19
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_23
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_24: # %bb60
+; CHECK-BE-NEXT:  .LBB0_24: # %bb20
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_24
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_25: # %bb55
+; CHECK-BE-NEXT:  .LBB0_25: # %bb23
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_25
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_26: # %bb56
+; CHECK-BE-NEXT:  .LBB0_26: # %bb46
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_26
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_27: # %bb62
+; CHECK-BE-NEXT:  .LBB0_27: # %bb55
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_27
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_28: # %bb20
+; CHECK-BE-NEXT:  .LBB0_28: # %bb56
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_28
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_29: # %bb50
+; CHECK-BE-NEXT:  .LBB0_29: # %bb57
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_29
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_30: # %bb48
+; CHECK-BE-NEXT:  .LBB0_30: # %bb18
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_30
 ; CHECK-BE-NEXT:  .LBB0_31: # %bb9
diff --git a/llvm/test/CodeGen/PowerPC/reduce_cr.ll b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
index b1cac1cbc871aba..7491d13c5301015 100644
--- a/llvm/test/CodeGen/PowerPC/reduce_cr.ll
+++ b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
@@ -4,10 +4,10 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; First block frequency info
 ;CHECK:      block-frequency-info: loop_test
-;CHECK-NEXT: - BB0[entry]: float = 1.0, int = 12
-;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = 34
-;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = 21
-;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = 8
+;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
+;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
+;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = {{.*}}
+;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = {{.*}}
 
 ;CHECK:      block-frequency-info: loop_test
 ;CHECK:      block-frequency-info: loop_test
@@ -15,11 +15,11 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; Last block frequency info
 ;CHECK:      block-frequency-info: loop_test
-;CHECK-NEXT: - BB0[entry]: float = 1.0, int = 12
-;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = 34
-;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = 27
-;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = 21
-;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = 8
+;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
+;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
+;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = {{.*}}
+;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = {{.*}}
+;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = {{.*}}
 
 
 define void @loop_test(ptr %tags, i32 %count) {
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
index 8b4df1d2f99dac6..77d861ad0599c18 100644
--- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -372,19 +372,17 @@ exit:
 ; CHECK: # %bb.{{[0-9]+}}: # %entry
 ; CHECK: andi.
 ; CHECK: # %bb.{{[0-9]+}}: # %test2
-; Make sure then2 falls through from test2
+; Make sure else2 falls through from test2
 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
-; CHECK: # %bb.{{[0-9]+}}: # %then2
-; CHECK: andi. {{[0-9]+}}, {{[0-9]+}}, 4
+; CHECK: # %bb.{{[0-9]+}}: # %else2
+; CHECK: bl c
 ; CHECK: # %else1
 ; CHECK: bl a
 ; CHECK: bl a
-; Make sure then2 was copied into else1
+; CHECK: # %then2
 ; CHECK: andi. {{[0-9]+}}, {{[0-9]+}}, 4
 ; CHECK: # %end1
 ; CHECK: bl d
-; CHECK: # %else2
-; CHECK: bl c
 ; CHECK: # %end2
 define void @avoidable_test(i32 %tag) {
 entry:
diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll
index 4f7736e318cae6b..3d48dc9637eaedf 100644
--- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll
+++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll
@@ -2769,42 +2769,22 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    li t6, 31
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    bne t5, t6, .LBB6_1
-; CHECK-RV32-NEXT:  # %bb.7: # %entry
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_8, s11
-; CHECK-RV32-NEXT:  .LBB6_1: # %cond_2
-; CHECK-RV32-NEXT:    bne t3, t4, .LBB6_2
-; CHECK-RV32-NEXT:  # %bb.9: # %cond_2
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_10, s11
-; CHECK-RV32-NEXT:  .LBB6_2: # %cond_3
-; CHECK-RV32-NEXT:    bne t1, t2, .LBB6_3
-; CHECK-RV32-NEXT:  # %bb.11: # %cond_3
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_12, s11
-; CHECK-RV32-NEXT:  .LBB6_3: # %space
-; CHECK-RV32-NEXT:    #APP
-; CHECK-RV32-NEXT:    .zero 1048576
-; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_4
+; CHECK-RV32-NEXT:    bne t5, t6, .LBB6_2
+; CHECK-RV32-NEXT:    j .LBB6_1
 ; CHECK-RV32-NEXT:  .LBB6_8: # %dest_1
 ; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_4: # %dest_1
+; CHECK-RV32-NEXT:  .LBB6_1: # %dest_1
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 1
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_5
-; CHECK-RV32-NEXT:  .LBB6_10: # %dest_2
-; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_5: # %dest_2
+; CHECK-RV32-NEXT:    j .LBB6_3
+; CHECK-RV32-NEXT:  .LBB6_2: # %cond_2
+; CHECK-RV32-NEXT:    bne t3, t4, .LBB6_5
+; CHECK-RV32-NEXT:  .LBB6_3: # %dest_2
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 2
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_6
-; CHECK-RV32-NEXT:  .LBB6_12: # %dest_3
-; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_6: # %dest_3
+; CHECK-RV32-NEXT:  .LBB6_4: # %dest_3
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 3
 ; CHECK-RV32-NEXT:    #NO_APP
@@ -2907,6 +2887,15 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 64
 ; CHECK-RV32-NEXT:    ret
+; CHECK-RV32-NEXT:  .LBB6_5: # %cond_3
+; CHECK-RV32-NEXT:    beq t1, t2, .LBB6_4
+; CHECK-RV32-NEXT:  # %bb.6: # %space
+; CHECK-RV32-NEXT:    #APP
+; CHECK-RV32-NEXT:    .zero 1048576
+; CHECK-RV32-NEXT:    #NO_APP
+; CHECK-RV32-NEXT:  # %bb.7: # %space
+; CHECK-RV32-NEXT:    sw s11, 0(sp)
+; CHECK-RV32-NEXT:    jump .LBB6_8, s11
 ;
 ; CHECK-RV64-LABEL: relax_jal_spill_32_restore_block_correspondence:
 ; CHECK-RV64:       # %bb.0: # %entry
@@ -3026,34 +3015,21 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV64-NEXT:    sext.w t6, t6
 ; CHECK-RV64-NEXT:    sd t5, 16(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    sext.w t5, t5
-; CHECK-RV64-NEXT:    bne t5, t6, .LBB6_1
-; CHECK-RV64-NEXT:  # %bb.7: # %entry
-; CHECK-RV64-NEXT:    jump .LBB6_4, t5
-; CHECK-RV64-NEXT:  .LBB6_1: # %cond_2
-; CHECK-RV64-NEXT:    sext.w t5, t4
-; CHECK-RV64-NEXT:    sext.w t6, t3
-; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_2
-; CHECK-RV64-NEXT:  # %bb.9: # %cond_2
-; CHECK-RV64-NEXT:    jump .LBB6_5, t5
-; CHECK-RV64-NEXT:  .LBB6_2: # %cond_3
-; CHECK-RV64-NEXT:    sext.w t5, t2
-; CHECK-RV64-NEXT:    sext.w t6, t1
-; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_3
-; CHECK-RV64-NEXT:  # %bb.11: # %cond_3
-; CHECK-RV64-NEXT:    jump .LBB6_6, t5
-; CHECK-RV64-NEXT:  .LBB6_3: # %space
-; CHECK-RV64-NEXT:    #APP
-; CHECK-RV64-NEXT:    .zero 1048576
-; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_4: # %dest_1
+; CHECK-RV64-NEXT:    bne t5, t6, .LBB6_2
+; CHECK-RV64-NEXT:  .LBB6_1: # %dest_1
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 1
 ; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_5: # %dest_2
+; CHECK-RV64-NEXT:    j .LBB6_3
+; CHECK-RV64-NEXT:  .LBB6_2: # %cond_2
+; CHECK-RV64-NEXT:    sext.w t5, t4
+; CHECK-RV64-NEXT:    sext.w t6, t3
+; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_5
+; CHECK-RV64-NEXT:  .LBB6_3: # %dest_2
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 2
 ; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_6: # %dest_3
+; CHECK-RV64-NEXT:  .LBB6_4: # %dest_3
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 3
 ; CHECK-RV64-NEXT:    #NO_APP
@@ -3158,6 +3134,16 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV64-NEXT:    ld s11, 24(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 128
 ; CHECK-RV64-NEXT:    ret
+; CHECK-RV64-NEXT:  .LBB6_5: # %cond_3
+; CHECK-RV64-NEXT:    sext.w t5, t2
+; CHECK-RV64-NEXT:    sext.w t6, t1
+; CHECK-RV64-NEXT:    beq t6, t5, .LBB6_4
+; CHECK-RV64-NEXT:  # %bb.6: # %space
+; CHECK-RV64-NEXT:    #APP
+; CHECK-RV64-NEXT:    .zero 1048576
+; CHECK-RV64-NEXT:    #NO_APP
+; CHECK-RV64-NEXT:  # %bb.7: # %space
+; CHECK-RV64-NEXT:    jump .LBB6_1, t5
 entry:
   %ra = call i32 asm sideeffect "addi ra, x0, 1", "={ra}"()
   %t0 = call i32 asm sideeffect "addi t0, x0, 5", "={t0}"()
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index f0937059a8e7a2c..2a1bb5edf516bb6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -357,48 +357,50 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    subs r1, r0, #1
-; CHECK-NEXT:    sbcs r1, r12, #0
+; CHECK-NEXT:    mov lr, r0
+; CHECK-NEXT:    subs r0, #1
+; CHECK-NEXT:    sbcs r0, r1, #0
 ; CHECK-NEXT:    blt.w .LBB1_28
 ; CHECK-NEXT:  @ %bb.1: @ %for.cond2.preheader.lr.ph
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    csel lr, r2, r3, lt
-; CHECK-NEXT:    movw r4, #43691
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    cmp.w lr, #3
+; CHECK-NEXT:    csel r7, r2, r0, lt
+; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    it ls
 ; CHECK-NEXT:    movls r1, #3
-; CHECK-NEXT:    movt r4, #43690
-; CHECK-NEXT:    sub.w r1, r1, lr
-; CHECK-NEXT:    ldr r6, [sp, #128]
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    subs r1, r1, r7
+; CHECK-NEXT:    movw r2, #43691
 ; CHECK-NEXT:    adds r1, #2
+; CHECK-NEXT:    movt r2, #43690
+; CHECK-NEXT:    ldr r6, [sp, #128]
 ; CHECK-NEXT:    movw r8, :lower16:c
+; CHECK-NEXT:    umull r1, r2, r1, r2
 ; CHECK-NEXT:    movt r8, :upper16:c
-; CHECK-NEXT:    mov.w r9, #12
-; CHECK-NEXT:    umull r1, r4, r1, r4
+; CHECK-NEXT:    movs r1, #4
 ; CHECK-NEXT:    @ implicit-def: $r10
 ; CHECK-NEXT:    @ implicit-def: $r5
 ; CHECK-NEXT:    @ implicit-def: $r11
-; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    movs r1, #4
-; CHECK-NEXT:    strd r2, r12, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    add.w r3, r3, r4, lsr #1
-; CHECK-NEXT:    add.w r1, r1, r4, lsr #1
-; CHECK-NEXT:    movw r4, #65532
-; CHECK-NEXT:    vdup.32 q6, r3
-; CHECK-NEXT:    movt r4, #32767
-; CHECK-NEXT:    and.w r7, r1, r4
+; CHECK-NEXT:    mov.w r9, #12
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r0, r2, lsr #1
+; CHECK-NEXT:    add.w r1, r1, r2, lsr #1
+; CHECK-NEXT:    movw r2, #65532
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    movt r2, #32767
+; CHECK-NEXT:    and.w r3, r1, r2
 ; CHECK-NEXT:    adr r1, .LCPI1_0
-; CHECK-NEXT:    vdup.32 q7, r3
+; CHECK-NEXT:    vdup.32 q7, r0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI1_1
 ; CHECK-NEXT:    vldrw.u32 q5, [r1]
-; CHECK-NEXT:    vadd.i32 q4, q0, lr
-; CHECK-NEXT:    b .LBB1_4
+; CHECK-NEXT:    strd r3, r7, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT:    vadd.i32 q4, q0, r7
+; CHECK-NEXT:    b .LBB1_6
 ; CHECK-NEXT:  .LBB1_2: @ %for.body6.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    mov r0, r11
 ; CHECK-NEXT:    cmn.w r11, #4
 ; CHECK-NEXT:    it le
@@ -407,7 +409,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    adds r0, #6
 ; CHECK-NEXT:    movt r2, #9362
 ; CHECK-NEXT:    sub.w r1, r0, r11
-; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    umull r2, r3, r1, r2
 ; CHECK-NEXT:    subs r2, r1, r3
 ; CHECK-NEXT:    add.w r2, r3, r2, lsr #1
@@ -415,59 +417,67 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    lsls r3, r3, #3
 ; CHECK-NEXT:    sub.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    mov r3, r10
 ; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    add.w r11, r0, #7
-; CHECK-NEXT:    ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup5
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  .LBB1_4: @ %for.cond.cleanup5
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup5
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    adds r5, #2
-; CHECK-NEXT:    subs r1, r5, r0
-; CHECK-NEXT:    asr.w r3, r5, #31
-; CHECK-NEXT:    sbcs.w r1, r3, r12
+; CHECK-NEXT:    subs.w r1, r5, lr
+; CHECK-NEXT:    asr.w r0, r5, #31
+; CHECK-NEXT:    sbcs.w r0, r0, r12
 ; CHECK-NEXT:    bge.w .LBB1_28
-; CHECK-NEXT:  .LBB1_4: @ %for.cond2.preheader
+; CHECK-NEXT:  .LBB1_6: @ %for.cond2.preheader
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB1_17 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_11 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_8 Depth 3
-; CHECK-NEXT:    @ Child Loop BB1_13 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_19 Depth 2
+; CHECK-NEXT:    @ Child Loop BB1_13 Depth 2
+; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_15 Depth 3
 ; CHECK-NEXT:    cmp.w r11, #2
-; CHECK-NEXT:    bgt .LBB1_3
-; CHECK-NEXT:  @ %bb.5: @ %for.body6.lr.ph
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    cmp.w lr, #5
-; CHECK-NEXT:    bhi .LBB1_15
-; CHECK-NEXT:  @ %bb.6: @ %for.body6.us.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    bgt .LBB1_5
+; CHECK-NEXT:  @ %bb.7: @ %for.body6.lr.ph
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    cmp r7, #5
+; CHECK-NEXT:    bhi .LBB1_17
+; CHECK-NEXT:  @ %bb.8: @ %for.body6.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    ldrd r2, r3, [sp, #120]
 ; CHECK-NEXT:    movs r0, #32
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    mov r7, lr
+; CHECK-NEXT:    mov r4, r6
+; CHECK-NEXT:    mov r7, r12
+; CHECK-NEXT:    mov r6, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
+; CHECK-NEXT:    mov lr, r6
+; CHECK-NEXT:    mov r6, r4
+; CHECK-NEXT:    mov r12, r7
+; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vdup.32 q0, r2
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r2, r12, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    mov r7, r4
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    b .LBB1_11
-; CHECK-NEXT:  .LBB1_7: @ %for.body13.us51.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
-; CHECK-NEXT:    movw r4, :lower16:a
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    b .LBB1_13
+; CHECK-NEXT:  .LBB1_9: @ %for.body13.us51.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_13 Depth=2
+; CHECK-NEXT:    movw r2, :lower16:a
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:    movt r4, :upper16:a
-; CHECK-NEXT:    str r1, [r4]
-; CHECK-NEXT:    movw r4, :lower16:b
-; CHECK-NEXT:    movt r4, :upper16:b
-; CHECK-NEXT:    str r1, [r4]
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:  .LBB1_8: @ %vector.body111
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_11 Depth=2
+; CHECK-NEXT:    movt r2, :upper16:a
+; CHECK-NEXT:    str r1, [r2]
+; CHECK-NEXT:    movw r2, :lower16:b
+; CHECK-NEXT:    movt r2, :upper16:b
+; CHECK-NEXT:    str r1, [r2]
+; CHECK-NEXT:    mov r2, r3
+; CHECK-NEXT:  .LBB1_10: @ %vector.body111
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB1_13 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
-; CHECK-NEXT:    subs r4, #4
+; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q2
 ; CHECK-NEXT:    vshl.i32 q2, q1, #2
 ; CHECK-NEXT:    add.w r1, r1, #4
@@ -475,35 +485,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_8
-; CHECK-NEXT:  .LBB1_9: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
-; CHECK-NEXT:    cbnz r6, .LBB1_14
-; CHECK-NEXT:  .LBB1_10: @ %for.cond.cleanup17.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
-; CHECK-NEXT:    add.w r11, r3, #7
-; CHECK-NEXT:    cmn.w r3, #4
+; CHECK-NEXT:    bne .LBB1_10
+; CHECK-NEXT:  .LBB1_11: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_13 Depth=2
+; CHECK-NEXT:    cbnz r6, .LBB1_16
+; CHECK-NEXT:  .LBB1_12: @ %for.cond.cleanup17.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_13 Depth=2
+; CHECK-NEXT:    add.w r11, r0, #7
+; CHECK-NEXT:    cmn.w r0, #4
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    bge .LBB1_3
-; CHECK-NEXT:  .LBB1_11: @ %for.body6.us
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    bge .LBB1_5
+; CHECK-NEXT:  .LBB1_13: @ %for.body6.us
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB1_8 Depth 3
-; CHECK-NEXT:    @ Child Loop BB1_13 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_15 Depth 3
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    bne .LBB1_7
-; CHECK-NEXT:  @ %bb.12: @ %vector.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
-; CHECK-NEXT:    mov r4, r7
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    bne .LBB1_9
+; CHECK-NEXT:  @ %bb.14: @ %vector.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_13 Depth=2
+; CHECK-NEXT:    mov r2, r3
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:  .LBB1_13: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_11 Depth=2
+; CHECK-NEXT:  .LBB1_15: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB1_13 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
-; CHECK-NEXT:    subs r4, #4
+; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vcmp.u32 hi, q6, q2
 ; CHECK-NEXT:    vshl.i32 q2, q1, #2
 ; CHECK-NEXT:    add.w r1, r1, #4
@@ -511,61 +521,53 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_13
-; CHECK-NEXT:    b .LBB1_9
-; CHECK-NEXT:  .LBB1_14: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_11 Depth=2
+; CHECK-NEXT:    bne .LBB1_15
+; CHECK-NEXT:    b .LBB1_11
+; CHECK-NEXT:  .LBB1_16: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_13 Depth=2
 ; CHECK-NEXT:    eor r1, r10, #1
 ; CHECK-NEXT:    lsls r1, r1, #31
-; CHECK-NEXT:    bne .LBB1_10
+; CHECK-NEXT:    bne .LBB1_12
 ; CHECK-NEXT:    b .LBB1_26
-; CHECK-NEXT:  .LBB1_15: @ %for.body6.lr.ph.split
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  .LBB1_17: @ %for.body6.lr.ph.split
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    beq.w .LBB1_2
-; CHECK-NEXT:  @ %bb.16: @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:  .LBB1_17: @ %for.body6.us60
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:  @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:  .LBB1_19: @ %for.body6.us60
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lsls.w r1, r10, #31
 ; CHECK-NEXT:    bne .LBB1_27
-; CHECK-NEXT:  @ %bb.18: @ %for.cond.cleanup17.us63
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #4
-; CHECK-NEXT:    bge .LBB1_22
-; CHECK-NEXT:  @ %bb.19: @ %for.cond.cleanup17.us63.1
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #12
-; CHECK-NEXT:    bgt .LBB1_23
-; CHECK-NEXT:  @ %bb.20: @ %for.cond.cleanup17.us63.2
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #19
+; CHECK-NEXT:  @ %bb.20: @ %for.cond.cleanup17.us63
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #4
+; CHECK-NEXT:    bge.w .LBB1_3
+; CHECK-NEXT:  @ %bb.21: @ %for.cond.cleanup17.us63.1
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #12
 ; CHECK-NEXT:    bgt .LBB1_24
-; CHECK-NEXT:  @ %bb.21: @ %for.cond.cleanup17.us63.3
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    add.w r11, r3, #28
-; CHECK-NEXT:    cmn.w r3, #25
-; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    blt .LBB1_17
-; CHECK-NEXT:    b .LBB1_3
-; CHECK-NEXT:  .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #7
-; CHECK-NEXT:    b .LBB1_25
-; CHECK-NEXT:  .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #14
-; CHECK-NEXT:    b .LBB1_25
-; CHECK-NEXT:  .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #21
-; CHECK-NEXT:  .LBB1_25: @ %for.cond.cleanup5
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  @ %bb.22: @ %for.cond.cleanup17.us63.2
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #19
+; CHECK-NEXT:    bgt .LBB1_25
+; CHECK-NEXT:  @ %bb.23: @ %for.cond.cleanup17.us63.3
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    add.w r11, r0, #28
+; CHECK-NEXT:    cmn.w r0, #25
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    b .LBB1_3
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    blt .LBB1_19
+; CHECK-NEXT:    b .LBB1_5
+; CHECK-NEXT:  .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    add.w r11, r0, #14
+; CHECK-NEXT:    b .LBB1_4
+; CHECK-NEXT:  .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    add.w r11, r0, #21
+; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_26: @ %for.inc19.us
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    b .LBB1_26
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index f7fbff39162e0d2..ff8c914aac5f31e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1057,7 +1057,8 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
-; CHECK-NEXT:    strd r9, r1, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r3
 ; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
@@ -1573,12 +1574,12 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    ldrd r6, r9, [r0]
-; CHECK-NEXT:    and r7, r3, #3
+; CHECK-NEXT:    ldrd r7, r9, [r0]
+; CHECK-NEXT:    and r6, r3, #3
 ; CHECK-NEXT:    ldr r0, [r0, #8]
 ; CHECK-NEXT:    lsrs r3, r3, #2
 ; CHECK-NEXT:    @ implicit-def: $r12
-; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB19_3
@@ -1619,28 +1620,29 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:  .LBB19_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB19_2 Depth 2
-; CHECK-NEXT:    str r6, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    ldrd r5, r11, [r9]
 ; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
 ; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    wls lr, r2, .LBB19_4
 ; CHECK-NEXT:    b .LBB19_1
 ; CHECK-NEXT:  .LBB19_4: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    cbnz r2, .LBB19_7
+; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    cbnz r3, .LBB19_7
 ; CHECK-NEXT:  @ %bb.5: @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r7, r5
+; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    mov r4, r11
 ; CHECK-NEXT:    mov r8, r10
 ; CHECK-NEXT:  .LBB19_6: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldrd r2, r6, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    strd r7, r4, [r9]
-; CHECK-NEXT:    subs r6, #1
+; CHECK-NEXT:    strd r2, r4, [r9]
+; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    subs r7, #1
 ; CHECK-NEXT:    strd r3, r8, [r9, #8]
 ; CHECK-NEXT:    add.w r9, r9, #16
 ; CHECK-NEXT:    mov r1, r2
@@ -1650,20 +1652,20 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldrd lr, r4, [r1]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    ldrd r7, r1, [r1, #8]
+; CHECK-NEXT:    ldrd r2, r1, [r1, #8]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r1
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT:    vfma.f32 q0, q6, r7
+; CHECK-NEXT:    vfma.f32 q0, q6, r2
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
 ; CHECK-NEXT:    vfma.f32 q0, q7, r4
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
 ; CHECK-NEXT:    vfma.f32 q0, q4, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r5
-; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    vfma.f32 q0, q2, r8
 ; CHECK-NEXT:    vfma.f32 q0, q1, r10
@@ -1672,19 +1674,19 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:  @ %bb.8: @ %if.then58
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    str r5, [r6]
-; CHECK-NEXT:    mov r7, lr
+; CHECK-NEXT:    mov r2, lr
 ; CHECK-NEXT:    mov r4, r12
 ; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    b .LBB19_12
 ; CHECK-NEXT:  .LBB19_9: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    vmov r8, s1
-; CHECK-NEXT:    cmp r2, #2
+; CHECK-NEXT:    cmp r3, #2
 ; CHECK-NEXT:    vstr s1, [r6, #4]
 ; CHECK-NEXT:    str r5, [r6]
 ; CHECK-NEXT:    bne .LBB19_11
 ; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r7, r4
+; CHECK-NEXT:    mov r2, r4
 ; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    mov r4, lr
 ; CHECK-NEXT:    mov r8, r5
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
index 747021e5c64eb30..f70af5661f4c904 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
@@ -383,27 +383,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1_i1(<2 x i64> %a, <2 x i64> %b, i64
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    orr.w r2, r0, r1
+; CHECK-NEXT:    orr.w r3, r0, r1
 ; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, r3, d3
+; CHECK-NEXT:    vmov r1, r2, d3
 ; CHECK-NEXT:    csetm r12, eq
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    orrs r1, r3
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    csetm r4, eq
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    csetm lr, eq
-; CHECK-NEXT:    orrs r1, r3
-; CHECK-NEXT:    vmov r1, r4, d1
-; CHECK-NEXT:    csetm r3, eq
-; CHECK-NEXT:    orrs r1, r4
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    csetm r1, eq
-; CHECK-NEXT:    cbz r2, .LBB15_2
+; CHECK-NEXT:    cbz r3, .LBB15_2
 ; CHECK-NEXT:  @ %bb.1: @ %select.false
 ; CHECK-NEXT:    bfi r0, r12, #0, #8
-; CHECK-NEXT:    bfi r0, lr, #8, #8
+; CHECK-NEXT:    bfi r0, r4, #8, #8
 ; CHECK-NEXT:    b .LBB15_3
 ; CHECK-NEXT:  .LBB15_2:
-; CHECK-NEXT:    bfi r0, r3, #0, #8
+; CHECK-NEXT:    bfi r0, lr, #0, #8
 ; CHECK-NEXT:    bfi r0, r1, #8, #8
 ; CHECK-NEXT:  .LBB15_3: @ %select.end
 ; CHECK-NEXT:    vmsr p0, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index b5d981ef340254d..d8e0eb7119a9b27 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -6,101 +6,102 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #12
+; CHECK-NEXT:    sub sp, #12
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB0_8
 ; CHECK-NEXT:  @ %bb.1: @ %entry
-; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    bne .LBB0_3
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    mov r12, r0
 ; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    mov r10, r11
+; CHECK-NEXT:    mov r10, r2
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
-; CHECK-NEXT:    bic r2, r3, #1
-; CHECK-NEXT:    adr r4, .LCPI0_0
-; CHECK-NEXT:    subs r7, r2, #2
-; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r10, r11, r2, lsl #2
+; CHECK-NEXT:    bic r3, r3, #1
+; CHECK-NEXT:    subs r7, r3, #2
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    adr r4, .LCPI0_0
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
-; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
-; CHECK-NEXT:    add.w r8, r1, r2, lsl #2
-; CHECK-NEXT:    add.w r12, r0, r2, lsl #2
+; CHECK-NEXT:    add.w r10, r2, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r2, [r0], #8
+; CHECK-NEXT:    ldrd r4, r3, [r0], #8
 ; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    ldrd r7, r6, [r1], #8
-; CHECK-NEXT:    smull r4, r7, r7, r4
-; CHECK-NEXT:    asrl r4, r7, #31
+; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    smull r4, r11, r7, r4
+; CHECK-NEXT:    asrl r4, r11, #31
 ; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
 ; CHECK-NEXT:    mov.w r9, #-1
-; CHECK-NEXT:    sbcs.w r3, r9, r7
+; CHECK-NEXT:    sbcs.w r3, r9, r11
 ; CHECK-NEXT:    csetm r3, lt
 ; CHECK-NEXT:    bfi r5, r3, #0, #8
-; CHECK-NEXT:    smull r2, r3, r6, r2
-; CHECK-NEXT:    asrl r2, r3, #31
-; CHECK-NEXT:    rsbs.w r6, r2, #-2147483648
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    sbcs.w r6, r9, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
-; CHECK-NEXT:    csetm r6, lt
-; CHECK-NEXT:    bfi r5, r6, #8, #8
+; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    smull r6, r3, r6, r3
+; CHECK-NEXT:    asrl r6, r3, #31
+; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r6
+; CHECK-NEXT:    sbcs.w r7, r9, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r11, r3
+; CHECK-NEXT:    csetm r7, lt
+; CHECK-NEXT:    mvn r6, #-2147483648
+; CHECK-NEXT:    bfi r5, r7, #8, #8
 ; CHECK-NEXT:    vmsr p0, r5
-; CHECK-NEXT:    mvn r5, #-2147483648
 ; CHECK-NEXT:    vpsel q2, q2, q0
-; CHECK-NEXT:    vmov r2, r3, d4
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    sbcs r2, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    csetm r2, lt
-; CHECK-NEXT:    bfi r3, r2, #0, #8
-; CHECK-NEXT:    vmov r2, r4, d5
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    sbcs r2, r4, #0
-; CHECK-NEXT:    csetm r2, lt
-; CHECK-NEXT:    bfi r3, r2, #8, #8
-; CHECK-NEXT:    vmsr p0, r3
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    subs r3, r3, r6
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    csetm r3, lt
+; CHECK-NEXT:    bfi r4, r3, #0, #8
+; CHECK-NEXT:    vmov r3, r5, d5
+; CHECK-NEXT:    subs r3, r3, r6
+; CHECK-NEXT:    sbcs r3, r5, #0
+; CHECK-NEXT:    csetm r3, lt
+; CHECK-NEXT:    bfi r4, r3, #8, #8
+; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpsel q2, q2, q1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    strd r3, r2, [r11], #8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    strd r4, r3, [r2], #8
 ; CHECK-NEXT:    le lr, .LBB0_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-; CHECK-NEXT:    ldrd r2, r3, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    ldrd r7, r3, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    beq .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
-; CHECK-NEXT:    sub.w lr, r3, r2
+; CHECK-NEXT:    sub.w lr, r3, r7
 ; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    mov.w r1, #-2147483648
-; CHECK-NEXT:    mvn r3, #-2147483648
+; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r12], #4
+; CHECK-NEXT:    ldr r3, [r12], #4
 ; CHECK-NEXT:    ldr r4, [r8], #4
-; CHECK-NEXT:    smull r2, r5, r4, r2
-; CHECK-NEXT:    asrl r2, r5, #31
-; CHECK-NEXT:    subs r4, r1, r2
-; CHECK-NEXT:    sbcs.w r4, r0, r5
-; CHECK-NEXT:    cset r4, lt
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csel r2, r2, r1, ne
-; CHECK-NEXT:    csel r4, r5, r0, ne
-; CHECK-NEXT:    subs r5, r2, r3
-; CHECK-NEXT:    sbcs r4, r4, #0
-; CHECK-NEXT:    csel r2, r2, r3, lt
-; CHECK-NEXT:    str r2, [r10], #4
+; CHECK-NEXT:    smull r4, r3, r4, r3
+; CHECK-NEXT:    asrl r4, r3, #31
+; CHECK-NEXT:    subs r5, r1, r4
+; CHECK-NEXT:    sbcs.w r5, r0, r3
+; CHECK-NEXT:    cset r5, lt
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csel r4, r4, r1, ne
+; CHECK-NEXT:    csel r3, r3, r0, ne
+; CHECK-NEXT:    subs r5, r4, r2
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    csel r3, r4, r2, lt
+; CHECK-NEXT:    str r3, [r10], #4
 ; CHECK-NEXT:    le lr, .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #12
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.9:
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index fd44a54a7273801..69cc1800555a76a 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -399,29 +399,29 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    and %s0, %s0, (32)0
 ; PIC-NEXT:    brlt.w 2, %s0, .LBB5_4
 ; PIC-NEXT:  # %bb.1:
-; PIC-NEXT:    breq.w 1, %s0, .LBB5_8
+; PIC-NEXT:    breq.w 1, %s0, .LBB5_7
 ; PIC-NEXT:  # %bb.2:
-; PIC-NEXT:    brne.w 2, %s0, .LBB5_7
+; PIC-NEXT:    brne.w 2, %s0, .LBB5_9
 ; PIC-NEXT:  # %bb.3:
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB5_4:
-; PIC-NEXT:    breq.w 3, %s0, .LBB5_9
+; PIC-NEXT:    breq.w 3, %s0, .LBB5_8
 ; PIC-NEXT:  # %bb.5:
-; PIC-NEXT:    brne.w 4, %s0, .LBB5_7
+; PIC-NEXT:    brne.w 4, %s0, .LBB5_9
 ; PIC-NEXT:  # %bb.6:
 ; PIC-NEXT:    and %s0, %s1, (32)0
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s0
-; PIC-NEXT:  .LBB5_7:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB5_8:
+; PIC-NEXT:  .LBB5_7:
 ; PIC-NEXT:    or %s0, 3, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB5_9:
+; PIC-NEXT:  .LBB5_8:
 ; PIC-NEXT:    or %s0, 4, (0)1
+; PIC-NEXT:  .LBB5_9:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %7 [
@@ -479,21 +479,21 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_8:
-; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB6_9:
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_7:
-; CHECK-NEXT:    or %s0, 11, (0)1
+; CHECK-NEXT:  .LBB6_5:
+; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB6_6:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_5:
-; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
+; CHECK-NEXT:  .LBB6_7:
+; CHECK-NEXT:    or %s0, 11, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB6_8:
+; CHECK-NEXT:    or %s0, 0, %s2
+; CHECK-NEXT:  .LBB6_9:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -530,6 +530,10 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    or %s0, 10, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB6_13:
+; PIC-NEXT:    or %s0, 0, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_14:
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -542,10 +546,6 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:  .LBB6_2:
 ; PIC-NEXT:    or %s0, 3, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB6_13:
-; PIC-NEXT:    or %s0, 0, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %11 [
     i32 1, label %12
diff --git a/llvm/test/CodeGen/VE/Scalar/brind.ll b/llvm/test/CodeGen/VE/Scalar/brind.ll
index 907f0a07504156a..6585e2b5eda664a 100644
--- a/llvm/test/CodeGen/VE/Scalar/brind.ll
+++ b/llvm/test/CodeGen/VE/Scalar/brind.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=ve | FileCheck %s
 
 ; Function Attrs: norecurse nounwind readnone
@@ -18,19 +19,19 @@ define signext i32 @brind(i32 signext %0) {
 ; CHECK-NEXT:    cmov.w.eq %s1, %s2, %s0
 ; CHECK-NEXT:    b.l.t (, %s1)
 ; CHECK-NEXT:  .Ltmp0: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    or %s0, -1, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .Ltmp2: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
-; CHECK-NEXT:    or %s0, 2, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .Ltmp1: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    or %s0, 1, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .Ltmp2: # Block address taken
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    or %s0, 2, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = icmp eq i32 %0, 1
   %3 = select i1 %2, ptr blockaddress(@brind, %6), ptr blockaddress(@brind, %8)
diff --git a/llvm/test/CodeGen/X86/bb_rotate.ll b/llvm/test/CodeGen/X86/bb_rotate.ll
index 55a7b0138026328..0ed0600e8dbad67 100644
--- a/llvm/test/CodeGen/X86/bb_rotate.ll
+++ b/llvm/test/CodeGen/X86/bb_rotate.ll
@@ -4,13 +4,13 @@ define i1 @no_viable_top_fallthrough() {
 ; CHECK-LABEL: no_viable_top_fallthrough
 ; CHECK: %.entry
 ; CHECK: %.bb1
+; CHECK: %.stop
 ; CHECK: %.bb2
 ; CHECK: %.middle
 ; CHECK: %.backedge
 ; CHECK: %.bb3
 ; CHECK: %.header
 ; CHECK: %.exit
-; CHECK: %.stop
 .entry:
   %val1 = call i1 @foo()
   br i1 %val1, label %.bb1, label %.header, !prof !10
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index cee8489e9aaea0c..bb081f6bab5329f 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -68,8 +68,8 @@ define void @func_large() !prof !0 {
 ; increased by ~17%
 ;
 ; CHECK-LABEL: Applying ext-tsp layout
-; CHECK:   original  layout score: 9171074274.27
-; CHECK:   optimized layout score: 10844307310.87
+; CHECK:   original  layout score: 23587612604815436.00
+; CHECK:   optimized layout score: 27891096739311172.00
 ; CHECK: b0
 ; CHECK: b2
 ; CHECK: b3
@@ -84,8 +84,8 @@ define void @func_large() !prof !0 {
 ; An expected output with chain-split-threshold=1 (disabling split point enumeration)
 ;
 ; CHECK2-LABEL: Applying ext-tsp layout
-; CHECK2:   original  layout score: 9171074274.27
-; CHECK2:   optimized layout score: 10844307310.87
+; CHECK2:   original  layout score: 23587612604815436.00
+; CHECK2:   optimized layout score: 27891096739311172.00
 ; CHECK2: b0
 ; CHECK2: b2
 ; CHECK2: b3
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d26f4b7044cf3c0..bf7c1c00c71df10 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -271,46 +271,47 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl $64, %edx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    subl %edx, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebp, %ecx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    cmovnel %ebx, %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    cmovnel %esi, %edi
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.8: # %_udiv-special-cases
-; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    xorl $127, %ebp
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB4_9
 ; X86-NEXT:  # %bb.5: # %udiv-bb1
@@ -326,9 +327,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    xorb $127, %al
 ; X86-NEXT:    movb %al, %ch
 ; X86-NEXT:    andb $7, %ch
@@ -353,33 +353,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %esi, %eax
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_1:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_9
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -393,16 +389,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %ebx
-; X86-NEXT:    movl 100(%esp,%ebx), %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 100(%esp,%ebx), %ebp
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl 96(%esp,%ebx), %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl 88(%esp,%ebx), %esi
+; X86-NEXT:    movl 88(%esp,%ebx), %edx
 ; X86-NEXT:    movl 92(%esp,%ebx), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    shrl %cl, %eax
@@ -413,8 +409,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -429,7 +425,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -440,22 +437,22 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shldl $1, %ebp, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    shldl $1, %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    shldl $1, %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    shldl $1, %esi, %edi
 ; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebp, %ecx
@@ -464,12 +461,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
@@ -482,8 +478,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -496,26 +492,25 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %edi
 ; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    shldl $1, %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:  .LBB4_9: # %udiv-end
 ; X86-NEXT:    xorl %ebx, %edi
 ; X86-NEXT:    xorl %ebx, %edx
@@ -528,11 +523,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl %edi, 12(%ecx)
+; X86-NEXT:    movl %esi, (%ebp)
+; X86-NEXT:    movl %eax, 4(%ebp)
+; X86-NEXT:    movl %edx, 8(%ebp)
+; X86-NEXT:    movl %edi, 12(%ebp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edx, %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index ebb95f16a723c4c..41f5d8590c237dc 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -177,14 +177,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    subl $136, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sete %bl
@@ -205,7 +205,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebp, %ebp
+; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl $31, %ebp
@@ -262,28 +262,25 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovnel %ecx, %esi
 ; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmovnel %ecx, %ebp
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -300,20 +297,20 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 124(%esp,%eax), %edx
-; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 132(%esp,%eax), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 120(%esp,%eax), %ebp
+; X86-NEXT:    movl 124(%esp,%eax), %ebp
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shrl %esi
 ; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl 116(%esp,%eax), %ebp
+; X86-NEXT:    movl 120(%esp,%eax), %ebp
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %ebp, %edx
 ; X86-NEXT:    shll %cl, %ebp
@@ -321,16 +318,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -354,26 +352,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 80(%esp,%eax), %ebp
+; X86-NEXT:    movl 84(%esp,%eax), %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 80(%esp,%eax), %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%esp,%eax), %ebp
+; X86-NEXT:    movl 76(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shrdl %cl, %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -386,41 +387,41 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ebx, %ebp
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    shldl $1, %esi, %ebp
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
@@ -433,93 +434,94 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    subl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %edx
-; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ebp, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %esi, %ebp
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ebp
@@ -546,7 +548,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    addl $136, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fsafdo_test3.ll b/llvm/test/CodeGen/X86/fsafdo_test3.ll
index bbcc3ff59ec35fd..79b57fe4f1a3283 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test3.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test3.ll
@@ -43,51 +43,51 @@
 ;; Check BFI before and after
 
 ; BFI: block-frequency-info: foo
-; BFI:  - BB0[entry]: float = 1.0, int = 8, count = 4268
-; BFI:  - BB1[for.cond1.preheader]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB2[if.then]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB3[if.end]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB4[if.then7]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB5[if.end9]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB6[if.then.1]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB7[if.end.1]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB8[if.then7.1]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB9[if.end9.1]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB10[if.then.2]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB11[if.end.2]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB12[if.then7.2]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB13[if.end9.2]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB14[if.then.3]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB15[if.end.3]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB16[if.then7.3]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB17[if.end9.3]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB18[for.end12]: float = 1.0, int = 8, count = 4268
+; BFI:  - BB0[entry]: float = 1.0, int = {{.*}}, count = 4268
+; BFI:  - BB1[for.cond1.preheader]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB2[if.then]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB3[if.end]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB4[if.then7]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB5[if.end9]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB6[if.then.1]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB7[if.end.1]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB8[if.then7.1]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB9[if.end9.1]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB10[if.then.2]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB11[if.end.2]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB12[if.then7.2]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB13[if.end9.2]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB14[if.then.3]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB15[if.end.3]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB16[if.then7.3]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB17[if.end9.3]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB18[for.end12]: float = 1.0, int = {{.*}}, count = 4268
 ;
 ; BFI: # *** IR Dump Before SampleFDO loader in MIR (fs-profile-loader) ***:
 ; BFI: # End machine code for function foo.
 ; BFI-EMPTY:
 ; BFI: block-frequency-info: foo
-; BFI:  - BB0[entry]: float = 1.0, int = 8, count = 4268
-; BFI:  - BB1[for.cond1.preheader]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB2[if.then]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB3[if.end]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB4[if.then7]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB5[if.end9]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB6[if.then.1]: float = 65.351, int = 522, count = 278487
-; BFI:  - BB7[if.end.1]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB8[if.then7.1]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB9[if.end9.1]: float = 66.446, int = 531, count = 283289
-; BFIV0:  - BB10[if.then.2]: float = 2.7041, int = 21, count = 11204
-; BFIV1:  - BB10[if.then.2]: float = 61.075, int = 488, count = 260348
-; BFI:  - BB11[if.end.2]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB12[if.then7.2]: float = 65.405, int = 523, count = 279021
-; BFI:  - BB13[if.end9.2]: float = 66.446, int = 531, count = 283289
-; BFIV0:  - BB14[if.then.3]: float = 61.075, int = 488, count = 260348
-; BFIV1:  - BB14[if.then.3]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB15[if.end.3]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB16[if.then7.3]: float = 54.846, int = 438, count = 233673
-; BFI:  - BB17[if.end9.3]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB18[for.end12]: float = 1.0, int = 8, count = 4268
+; BFI:  - BB0[entry]: float = 1.0, int = {{.*}}, count = 4268
+; BFI:  - BB1[for.cond1.preheader]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB2[if.then]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB3[if.end]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB4[if.then7]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB5[if.end9]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB6[if.then.1]: float = 65.351, int = {{.*}}, count = 278916
+; BFI:  - BB7[if.end.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB8[if.then7.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB9[if.end9.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFIV0:  - BB10[if.then.2]: float = 2.7041, int = {{.*}}, count = 11541
+; BFIV1:  - BB10[if.then.2]: float = 61.075, int = {{.*}}, count = 260670
+; BFI:  - BB11[if.end.2]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB12[if.then7.2]: float = 65.405, int = {{.*}}, count = 279149
+; BFI:  - BB13[if.end9.2]: float = 66.446, int = {{.*}}, count = 283590
+; BFIV0:  - BB14[if.then.3]: float = 61.075, int = {{.*}}, count = 260670
+; BFIV1:  - BB14[if.then.3]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB15[if.end.3]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB16[if.then7.3]: float = 54.846, int = {{.*}}, count = 234082
+; BFI:  - BB17[if.end9.3]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB18[for.end12]: float = 1.0, int = {{.*}}, count = 4268
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index cb7213325ee9107..505eb0f8e6ddacd 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -23,26 +23,27 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    .cfi_offset %ebp, -8
 ; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    # implicit-def: $esi
+; CHECK-NEXT:    # implicit-def: $ecx
 ; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    # kill: killed $al
+; CHECK-NEXT:    # implicit-def: $al
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_13: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    jne .LBB0_15
 ; CHECK-NEXT:  # %bb.14: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_22 Depth 2
-; CHECK-NEXT:    cmpb $8, %dl
-; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    cmpb $8, %al
 ; CHECK-NEXT:    ja .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %for.cond
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -50,26 +51,27 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.4: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    cltd
 ; CHECK-NEXT:    idivl a
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    movb %cl, %dh
 ; CHECK-NEXT:    movl $0, h
-; CHECK-NEXT:    cmpb $8, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    cmpb $8, %al
 ; CHECK-NEXT:    jg .LBB0_8
 ; CHECK-NEXT:  # %bb.5: # %if.then13
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
 ; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %dl
 ; CHECK-NEXT:    jne .LBB0_18
 ; CHECK-NEXT:    jmp .LBB0_6
 ; CHECK-NEXT:    .p2align 4, 0x90
@@ -79,22 +81,20 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    jne .LBB0_16
 ; CHECK-NEXT:  # %bb.17: # %if.then31
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movb %dl, %ch
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:  .LBB0_18: # %for.inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_3: # %if.then
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; CHECK-NEXT:    # implicit-def: $eax
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; CHECK-NEXT:  .LBB0_6: # %for.cond35
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testl %edi, %edi
@@ -115,26 +115,25 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    calll printf
 ; CHECK-NEXT:  .LBB0_21: # %for.end46
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    # implicit-def: $dh
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_22
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_8: # %if.end21
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    # implicit-def: $ebp
-; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    je .LBB0_13
-; CHECK-NEXT:    jmp .LBB0_10
+; CHECK-NEXT:    jmp .LBB0_9
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_16: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_7: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    xorl %edi, %edi
-; CHECK-NEXT:    movb %dl, %cl
+; CHECK-NEXT:    movb %dl, %dh
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_22: # %for.cond47
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -145,17 +144,18 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # in Loop: Header=BB0_22 Depth=2
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne .LBB0_22
-; CHECK-NEXT:  # %bb.24: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %ch, %dl
+; CHECK-NEXT:  .LBB0_9: # %ae
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    je .LBB0_13
-; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:  # %bb.10: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    # implicit-def: $eax
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    je .LBB0_19
 ; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
+; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    # kill: killed $cl
 ; CHECK-NEXT:    # implicit-def: $dl
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    testl %edi, %edi
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 7c8618af83d3d7b..308ae65928e4d10 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -267,6 +267,14 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-NEXT:    movl $2, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_2: # Block address taken
+; X64-NEXT:    # %bb1
+; X64-NEXT:    cmpq $.LBB4_2, %rdx
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $7, %eax
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .LBB4_3: # Block address taken
 ; X64-NEXT:    # %bb2
 ; X64-NEXT:    cmpq $.LBB4_3, %rdx
@@ -283,14 +291,6 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-NEXT:    movl $42, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_2: # Block address taken
-; X64-NEXT:    # %bb1
-; X64-NEXT:    cmpq $.LBB4_2, %rdx
-; X64-NEXT:    cmovneq %rax, %rcx
-; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    retq
 ;
 ; X64-PIC-LABEL: test_indirectbr:
 ; X64-PIC:       # %bb.0: # %entry
@@ -309,6 +309,15 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-PIC-NEXT:    movl $2, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB4_2: # Block address taken
+; X64-PIC-NEXT:    # %bb1
+; X64-PIC-NEXT:    leaq .LBB4_2(%rip), %rsi
+; X64-PIC-NEXT:    cmpq %rsi, %rdx
+; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $7, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .LBB4_3: # Block address taken
 ; X64-PIC-NEXT:    # %bb2
 ; X64-PIC-NEXT:    leaq .LBB4_3(%rip), %rsi
@@ -327,15 +336,6 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-PIC-NEXT:    movl $42, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
-; X64-PIC-NEXT:  .LBB4_2: # Block address taken
-; X64-PIC-NEXT:    # %bb1
-; X64-PIC-NEXT:    leaq .LBB4_2(%rip), %rsi
-; X64-PIC-NEXT:    cmpq %rsi, %rdx
-; X64-PIC-NEXT:    cmovneq %rax, %rcx
-; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
-; X64-PIC-NEXT:    orq %rcx, %rsp
-; X64-PIC-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirectbr:
 ; X64-RETPOLINE:       # %bb.0: # %entry
@@ -375,27 +375,27 @@ define dso_local i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp1: # Block address taken
-; X64-NEXT:  .LBB5_3: # %bb2
-; X64-NEXT:    cmpq $.LBB5_3, %rdx
+; X64-NEXT:  .LBB5_2: # %bb1
+; X64-NEXT:    cmpq $.LBB5_2, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $13, %eax
+; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp2: # Block address taken
-; X64-NEXT:  .LBB5_4: # %bb3
-; X64-NEXT:    cmpq $.LBB5_4, %rdx
+; X64-NEXT:  .LBB5_3: # %bb2
+; X64-NEXT:    cmpq $.LBB5_3, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $42, %eax
+; X64-NEXT:    movl $13, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp3: # Block address taken
-; X64-NEXT:  .LBB5_2: # %bb1
-; X64-NEXT:    cmpq $.LBB5_2, %rdx
+; X64-NEXT:  .LBB5_4: # %bb3
+; X64-NEXT:    cmpq $.LBB5_4, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $7, %eax
+; X64-NEXT:    movl $42, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ;
@@ -419,30 +419,30 @@ define dso_local i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp1: # Block address taken
-; X64-PIC-NEXT:  .LBB5_3: # %bb2
-; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_2: # %bb1
+; X64-PIC-NEXT:    leaq .LBB5_2(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $13, %eax
+; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp2: # Block address taken
-; X64-PIC-NEXT:  .LBB5_4: # %bb3
-; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_3: # %bb2
+; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $42, %eax
+; X64-PIC-NEXT:    movl $13, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp3: # Block address taken
-; X64-PIC-NEXT:  .LBB5_2: # %bb1
-; X64-PIC-NEXT:    leaq .LBB5_2(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_4: # %bb3
+; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
+; X64-PIC-NEXT:    movl $42, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ;
@@ -658,12 +658,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:    movl $11, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
-; X64-RETPOLINE-NEXT:  .LBB7_6:
-; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
-; X64-RETPOLINE-NEXT:    shlq $47, %rcx
-; X64-RETPOLINE-NEXT:    movl $2, %eax
-; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .LBB7_7: # %bb1
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
@@ -676,6 +670,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:    movl $42, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
+; X64-RETPOLINE-NEXT:  .LBB7_6:
+; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
+; X64-RETPOLINE-NEXT:    movl $2, %eax
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 entry:
   switch i32 %idx, label %bb0 [
     i32 0, label %bb1
@@ -839,13 +839,16 @@ define dso_local i32 @test_switch_jumptable_fallthrough(i32 %idx, ptr %a.ptr, pt
 ; X64-RETPOLINE-NEXT:  # %bb.13:
 ; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:    jmp .LBB8_12
+; X64-RETPOLINE-NEXT:  .LBB8_2:
+; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT:    jmp .LBB8_9
+; X64-RETPOLINE-NEXT:  .LBB8_6:
+; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT:    jmp .LBB8_11
 ; X64-RETPOLINE-NEXT:  .LBB8_8:
 ; X64-RETPOLINE-NEXT:    cmoveq %r10, %r9
 ; X64-RETPOLINE-NEXT:    movl (%rsi), %edi
 ; X64-RETPOLINE-NEXT:    orl %r9d, %edi
-; X64-RETPOLINE-NEXT:    jmp .LBB8_9
-; X64-RETPOLINE-NEXT:  .LBB8_2:
-; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:  .LBB8_9: # %bb1
 ; X64-RETPOLINE-NEXT:    addl (%rdx), %edi
 ; X64-RETPOLINE-NEXT:    orl %r9d, %edi
@@ -853,9 +856,6 @@ define dso_local i32 @test_switch_jumptable_fallthrough(i32 %idx, ptr %a.ptr, pt
 ; X64-RETPOLINE-NEXT:  .LBB8_10: # %bb2
 ; X64-RETPOLINE-NEXT:    addl (%rcx), %eax
 ; X64-RETPOLINE-NEXT:    orl %r9d, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB8_11
-; X64-RETPOLINE-NEXT:  .LBB8_6:
-; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:  .LBB8_11: # %bb3
 ; X64-RETPOLINE-NEXT:    addl (%r8), %eax
 ; X64-RETPOLINE-NEXT:    orl %r9d, %eax
diff --git a/llvm/test/CodeGen/X86/statepoint-ra.ll b/llvm/test/CodeGen/X86/statepoint-ra.ll
index 4e57648820c4b30..5a4e04dd70553a6 100644
--- a/llvm/test/CodeGen/X86/statepoint-ra.ll
+++ b/llvm/test/CodeGen/X86/statepoint-ra.ll
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;YAML:   - String:          ' total spills cost '
 ;YAML:   - NumReloads:      '7'
 ;YAML:   - String:          ' reloads '
-;YAML:   - TotalReloadsCost: '3.109004e-15'
+;YAML:   - TotalReloadsCost: '3.108624e-15'
 ;YAML:   - String:          ' total reloads cost '
 ;YAML:   - NumZeroCostFoldedReloads: '20'
 ;YAML:   - String:          ' zero cost folded reloads '
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index 264c5bf15aa28bb..57191eba8e6c460 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -1880,6 +1880,13 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.7: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_12: # %entry
+; CHECK-NEXT:    cmpl $50, %edi
+; CHECK-NEXT:    je .LBB19_17
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    cmpl $60, %edi
+; CHECK-NEXT:    je .LBB19_14
+; CHECK-NEXT:    jmp .LBB19_18
 ; CHECK-NEXT:  .LBB19_8: # %entry
 ; CHECK-NEXT:    cmpl $30, %edi
 ; CHECK-NEXT:    je .LBB19_16
@@ -1889,20 +1896,14 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.10: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_12: # %entry
-; CHECK-NEXT:    cmpl $50, %edi
-; CHECK-NEXT:    je .LBB19_17
-; CHECK-NEXT:  # %bb.13: # %entry
-; CHECK-NEXT:    cmpl $60, %edi
-; CHECK-NEXT:    je .LBB19_14
-; CHECK-NEXT:  .LBB19_18: # %return
-; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_15: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB19_16: # %bb3
 ; CHECK-NEXT:    movl $3, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_18: # %return
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_17: # %bb5
 ; CHECK-NEXT:    movl $5, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -2351,12 +2352,12 @@ define void @jump_table_affects_balance(i32 %x) {
 ; CHECK-NEXT:  .LBB22_6: # %bb0
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB22_8: # %bb2
-; CHECK-NEXT:    movl $2, %edi
-; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB22_7: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB22_8: # %bb2
+; CHECK-NEXT:    movl $2, %edi
+; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB22_10: # %return
 ; CHECK-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll b/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
index 6fa6f94e6530a97..1b8bf8eea5df25e 100644
--- a/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
+++ b/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
@@ -12,10 +12,10 @@ declare void @effect(i32);
 ; CHECK: %entry
 ; CHECK: %loop.top
 ; CHECK: %loop.latch
-; CHECK: %top.fakephi
 ; CHECK: %loop.end
 ; CHECK: %false
 ; CHECK: %ret
+; CHECK: %top.fakephi
 define void @no_successor_still_no_taildup (i32 %count, i32 %key) {
 entry:
   br label %loop.top
diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll
index ae3401ece7ce114..d54110d1fa8119a 100644
--- a/llvm/test/CodeGen/X86/tail-opts.ll
+++ b/llvm/test/CodeGen/X86/tail-opts.ll
@@ -279,11 +279,7 @@ define fastcc void @c_expand_expr_stmt(ptr %expr) nounwind {
 ; CHECK-NEXT:  .LBB3_9: # %bb3
 ; CHECK-NEXT:  .LBB3_15:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:  .LBB3_16: # %lvalue_p.exit4
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne .LBB3_9
-; CHECK-NEXT:  # %bb.17: # %lvalue_p.exit4
-; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    jmp .LBB3_16
 ; CHECK-NEXT:  .LBB3_10: # %bb2.i3
 ; CHECK-NEXT:    movq 8(%rax), %rax
 ; CHECK-NEXT:    movzbl 16(%rax), %ecx
@@ -302,8 +298,12 @@ define fastcc void @c_expand_expr_stmt(ptr %expr) nounwind {
 ; CHECK-NEXT:    je .LBB3_16
 ; CHECK-NEXT:  # %bb.14: # %bb2.i.i2
 ; CHECK-NEXT:    cmpl $23, %ecx
-; CHECK-NEXT:    je .LBB3_16
-; CHECK-NEXT:    jmp .LBB3_9
+; CHECK-NEXT:    jne .LBB3_9
+; CHECK-NEXT:  .LBB3_16: # %lvalue_p.exit4
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB3_9
+; CHECK-NEXT:  # %bb.17: # %lvalue_p.exit4
+; CHECK-NEXT:    testb %bl, %bl
 entry:
   %tmp4 = load i8, ptr null, align 8                  ; <i8> [#uses=3]
   switch i8 %tmp4, label %bb3 [
diff --git a/llvm/test/Other/cfg-printer-branch-weights.ll b/llvm/test/Other/cfg-printer-branch-weights.ll
index c8d57ecbbc2b223..803087f3318e969 100644
--- a/llvm/test/Other/cfg-printer-branch-weights.ll
+++ b/llvm/test/Other/cfg-printer-branch-weights.ll
@@ -6,11 +6,11 @@ entry:
   %check = icmp sgt i32 %0, 0
   br i1 %check, label %if, label %exit, !prof !0
 
-; CHECK: label="W:7"
+; CHECK: label="W:89623871094784"
 ; CHECK-NOT: ["];
 if:                     ; preds = %entry
   br label %exit
-; CHECK: label="W:1600"
+; CHECK: label="W:17924774638387200"
 ; CHECK-NOT: ["];
 exit:                   ; preds = %entry, %if
   ret void
diff --git a/llvm/test/ThinLTO/X86/function_entry_count.ll b/llvm/test/ThinLTO/X86/function_entry_count.ll
index 12cedba6b9c83dd..b65bc226040bfcb 100644
--- a/llvm/test/ThinLTO/X86/function_entry_count.ll
+++ b/llvm/test/ThinLTO/X86/function_entry_count.ll
@@ -18,7 +18,7 @@
 ; CHECK: define void @f(i32{{.*}}) [[ATTR:#[0-9]+]] !prof ![[PROF1:[0-9]+]]
 ; CHECK: define available_externally void @g() !prof ![[PROF2]]
 ; CHECK-DAG: ![[PROF1]] = !{!"synthetic_function_entry_count", i64 10}
-; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 198}
+; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 200}
 ; CHECK-DAG: attributes [[ATTR]] = { norecurse nounwind }
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
index 63568456d0e58c8..ca50a04a328151c 100644
--- a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -31,4 +31,4 @@ ret i32 %val
 !2 = !{!"branch_weights", i32 5, i32 5}
 !3 = !{!"branch_weights", i32 4, i32 1}
 
-; CHECK: [[COUNT1]] = !{!"branch_weights", i32 31, i32 8}
+; CHECK: [[COUNT1]] = !{!"branch_weights", i32 858993459, i32 214748365}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll b/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
index 88ba4d3562c826c..e4352e4d98b77e6 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
@@ -2,6 +2,10 @@
 
 ; REQUIRES: asserts
 
+; My changes fixed this likely by accident, please update as necessary when
+; you work on this:
+; XFAIL: *
+
 ; Matching assertion strings is not easy as they might differ on different
 ; platforms. So limit this to x86_64-linux.
 ; REQUIRES: x86_64-linux
diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
index f11bfd026688192..8c9d89871d00b32 100644
--- a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
@@ -52,5 +52,5 @@ bb_join:
 ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
 ; CHECK: [[PROF1]] = !{!"branch_weights", i32 400, i32 600}
 ; CHECK: [[PROF2]] = !{!"branch_weights", i32 300, i32 300}
-; CHECK: [[PROF3]] = !{!"branch_weights", i32 678152731, i32 1469330917}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 613566756, i32 1533916892}
 ;.
diff --git a/llvm/test/Transforms/JumpThreading/update-edge-weight.ll b/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
index ff82fb0b214d401..6313a87993303fd 100644
--- a/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
+++ b/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
@@ -2,7 +2,7 @@
 
 ; Test if edge weights are properly updated after jump threading.
 
-; CHECK: !2 = !{!"branch_weights", i32 1629125526, i32 518358122}
+; CHECK: !2 = !{!"branch_weights", i32 1561806291, i32 585677357}
 
 define void @foo(i32 %n) !prof !0 {
 entry:
diff --git a/llvm/test/Transforms/LICM/loopsink.ll b/llvm/test/Transforms/LICM/loopsink.ll
index c08b992f35f41b9..ea7b0e06264d711 100644
--- a/llvm/test/Transforms/LICM/loopsink.ll
+++ b/llvm/test/Transforms/LICM/loopsink.ll
@@ -195,23 +195,27 @@ define i32 @t3(i32, i32) #0 !prof !0 {
   ret i32 10
 }
 
-; For single-BB loop with <=1 avg trip count, sink load to b1
+; For single-BB loop with <=1 avg trip count, sink load to body
 ; CHECK: t4
-; CHECK: .preheader:
+; CHECK: .header:
 ; CHECK-NOT: load i32, ptr @g
-; CHECK: .b1:
+; CHECK: .body:
 ; CHECK: load i32, ptr @g
 ; CHECK: .exit:
 define i32 @t4(i32, i32) #0 !prof !0 {
-.preheader:
+.entry:
   %invariant = load i32, ptr @g
-  br label %.b1
+  br label %.header
 
-.b1:
-  %iv = phi i32 [ %t1, %.b1 ], [ 0, %.preheader ]
+.header:
+  %iv = phi i32 [ %t1, %.body ], [ 0, %.entry ]
+  %c0 = icmp sgt i32 %iv, %0
+  br i1 %c0, label %.body, label %.exit, !prof !1
+
+.body:
   %t1 = add nsw i32 %invariant, %iv
   %c1 = icmp sgt i32 %iv, %0
-  br i1 %c1, label %.b1, label %.exit, !prof !1
+  br label %.header
 
 .exit:
   ret i32 10
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
index 174d55651171c11..2dc515758afebb6 100644
--- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
@@ -78,5 +78,5 @@ for.body:                                         ; preds = %for.body, %for.body
 !19 = !{!"int", !13, i64 0}
 !20 = !DILocation(line: 9, column: 11, scope: !6)
 !21 = !{!"function_entry_count", i64 6}
-!22 = !{!"branch_weights", i32 99, i32 1}
+!22 = !{!"branch_weights", i32 2000, i32 1}
 !23 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
index 0b31fd8d45e8380..6f36f4d263f4301 100644
--- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
@@ -79,5 +79,5 @@ for.cond.cleanup:
 !20 = distinct !{!20, !21}
 !21 = !{!"llvm.loop.distribute.enable", i1 true}
 !22 = !{!"function_entry_count", i64 3}
-!23 = !{!"branch_weights", i32 99, i32 1}
+!23 = !{!"branch_weights", i32 2000, i32 1}
 !24 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
index f587ed99ab84daa..5d742b64e0adbf4 100644
--- a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
+++ b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
@@ -70,9 +70,9 @@ outer_loop_exit:
 
 ; BFI_AFTER-LABEL: block-frequency-info: func1
 ; BFI_AFTER: - entry: {{.*}} count = 1024
-; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
-; BFI_AFTER: - loop_body: {{.*}} count = 20608
-; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
+; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1016
+; BFI_AFTER: - loop_body: {{.*}} count = 20480
+; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1016
 ; BFI_AFTER: - loop_exit: {{.*}} count = 1024
 
 ; IR-LABEL: define void @func1
@@ -146,14 +146,14 @@ loop_exit:
 
 ; BFI_BEFORE-LABEL: block-frequency-info: func3_zero_branch_weight
 ; BFI_BEFORE: - entry: {{.*}} count = 1024
-; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255296
-; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254272
+; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255552
+; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254528
 ; BFI_BEFORE: - loop_exit: {{.*}} count = 1024
 
 ; BFI_AFTER-LABEL: block-frequency-info: func3_zero_branch_weight
 ; BFI_AFTER: - entry: {{.*}} count = 1024
 ; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
-; BFI_AFTER: - loop_body: {{.*}} count = 2199023255296
+; BFI_AFTER: - loop_body: {{.*}} count = 2199023255552
 ; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
 ; BFI_AFTER: - loop_exit: {{.*}} count = 1024
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
index 44aae477bf71c15..33d1d3f0d22191d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -7,11 +7,12 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Verify that we generate 512-bit wide vectors for a basic integer memset
 ; loop.
 
-; CHECK-LABEL: f:
-; CHECK: vmovdqu64 %zmm{{.}},
-; CHECK-NOT: %ymm
-; CHECK: epilog
+; CHECK-LABEL: _f:
+; CHECK: %vec.epilog.vector.body
 ; CHECK: %ymm
+; CHECK: %vector.body
+; CHECK-NOT: %ymm
+; CHECK: vmovdqu64 %zmm{{.}},
 
 ; Verify that we don't generate 512-bit wide vectors when subtarget feature says not to
 
@@ -46,7 +47,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit
 ; vectors
 
-; CHECK-LABEL: g:
+; CHECK-LABEL: _g:
 ; CHECK: vmovdqu %ymm{{.}},
 ; CHECK-NOT: %zmm
 
@@ -81,17 +82,19 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; Verify that the "prefer-vector-width=512" attribute override the subtarget
 ; vectors
 
-; CHECK-LABEL: h:
+; CHECK-LABEL: _h:
+; CHECK: %vec.epilog.vector.body
+; CHECK: %ymm
+; CHECK: %vector.body
 ; CHECK: vmovdqu64 %zmm{{.}},
 ; CHECK-NOT: %ymm
-; CHECK: epilog
-; CHECK: %ymm
 
 ; CHECK-PREFER-AVX256-LABEL: h:
+; CHECK-PREFER-AVX256: %vec.epilog.vector.body
+; CHECK-PREFER-AVX256: %ymm
+; CHECK-PREFER-AVX256: %vector.body
 ; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
 ; CHECK-PREFER-AVX256-NOT: %ymm
-; CHECK-PREFER-AVX256: epilog
-; CHECK-PREFER-AVX256: %ymm
 
 define void @h(ptr %a, i32 %n) "prefer-vector-width"="512" {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index b1fc96ea77ed034..4f413a50837dd69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -108,5 +108,5 @@ attributes #0 = { nounwind }
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
 !29 = !{!"function_entry_count", i64 3}
-!30 = !{!"branch_weights", i32 99, i32 1}
+!30 = !{!"branch_weights", i32 10000, i32 1}
 !31 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index ed107b10dcd9874..4da1d099645bee2 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -198,5 +198,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 30d11a12c79c4bc..4b7b714a2562800 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -209,5 +209,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext b/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
index c6cb02aaddd1d6d..651ca44caf808d0 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
@@ -1,7 +1,8 @@
 :ir
 f
 1096621589180411894
-2
+3
 3
 2
+1
 
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
index 85b9779abeece66..6757a1ad6185e0b 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
@@ -7,10 +7,10 @@ test_criticalEdge
 1
 2
 2
-0
-1
 2
 1
+0
+1
 
 <stdin>:bar
 742261418966908927
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
index f1497d6c01c9f89..3cc0bb0be65bf26 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
@@ -8,10 +8,10 @@ test_criticalEdge
 2
 1
 2
-0
-1
 2
 1
+0
+1
 
 <stdin>:bar
 742261418966908927
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
index 49fafd9d99bf91f..0cbdea7aacb6144 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
@@ -7,6 +7,6 @@ foo
 4
 # Counter Values:
 139
-20
 5
+20
 63
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
index 6910f7e21d677e7..70d2844ba5ade02 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
@@ -8,6 +8,6 @@ foo
 4
 # Counter Values:
 202
-20
 5
+20
 63
diff --git a/llvm/test/Transforms/PGOProfile/PR41279_2.ll b/llvm/test/Transforms/PGOProfile/PR41279_2.ll
index fc3e54fcb4c17a3..8c3c5695c1a5d6a 100644
--- a/llvm/test/Transforms/PGOProfile/PR41279_2.ll
+++ b/llvm/test/Transforms/PGOProfile/PR41279_2.ll
@@ -9,7 +9,21 @@ define dso_local void @f() personality ptr @__C_specific_handler {
 ; USE-SAME: !prof ![[FUNC_ENTRY_COUNT:[0-9]+]]
 ; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
 ; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
-; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 5}
+; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 6}
+;
+; GEN-LABEL: @f
+;
+; GEN: catch.dispatch:
+; GEN-NOT: call void @llvm.instrprof.increment
+;
+; GEN:  _except1:
+; GEN:    call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 3, i32 1)
+;
+; GEN: __except6:
+; GEN:   call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 3, i32 2)
+;
+; GEN: invoke.cont3:
+; GEN:   call void @llvm.instrprof.increment(ptr @__profn_f, i64 1096621589180411894, i32 3, i32 0)
 entry:
   %__exception_code = alloca i32, align 4
   %__exception_code2 = alloca i32, align 4
@@ -27,8 +41,6 @@ __except1:
   %2 = call i32 @llvm.eh.exceptioncode(token %1)
   store i32 %2, ptr %__exception_code, align 4
   br label %__try.cont7
-;GEN:  _except1:
-;GEN:    call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 2, i32 1)
 
 invoke.cont:
   br label %__try.cont
@@ -39,8 +51,6 @@ __try.cont:
 
 catch.dispatch4:
   %3 = catchswitch within none [label %__except5] unwind to caller
-; GEN: catch.dispatch4:
-; GEN-NOT: call void @llvm.instrprof.increment
 
 __except5:
   %4 = catchpad within %3 [ptr null]
@@ -56,9 +66,6 @@ __try.cont7:
 
 invoke.cont3:
   br label %__try.cont7
-;GEN: invoke.cont3:
-;GEN:  call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 2, i32 0)
-
 }
 
 declare dso_local i32 @__C_specific_handler(...)
diff --git a/llvm/test/Transforms/PGOProfile/bfi_verification.ll b/llvm/test/Transforms/PGOProfile/bfi_verification.ll
index 9d07842a3122177..0963f28c996d174 100644
--- a/llvm/test/Transforms/PGOProfile/bfi_verification.ll
+++ b/llvm/test/Transforms/PGOProfile/bfi_verification.ll
@@ -95,15 +95,7 @@ if.then25:
 if.end26:
   ret void
 }
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB do.body Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.cond Count=80655628 BFI_Count=83956530
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.body Count=41017879 BFI_Count=42370585
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.cond3 Count=71254487 BFI_Count=73756204
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.body7 Count=31616738 BFI_Count=32954900
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.end8 Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.then Count=32743703 BFI_Count=33739540
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.end Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.then25 Count=6013544 BFI_Count=6277124
-; THRESHOLD-CHECK: remark: <unknown>:0:0: In Func sort_basket: Num_of_BB=14, Num_of_non_zerovalue_BB=14, Num_of_mis_matching_BB=9
-; HOTONLY-CHECK: remark: <unknown>:0:0: BB if.then25 Count=6013544 BFI_Count=6277124 (raw-Cold to BFI-Hot)
-; HOTONLY-CHECK: remark: <unknown>:0:0: In Func sort_basket: Num_of_BB=14, Num_of_non_zerovalue_BB=14, Num_of_mis_matching_BB=1
+
+; TODO: All the inaccuracies that were tested here disappeared after improving BFI precision...
+; THRESHOLD-CHECK:{{.*}}
+; HOTONLY-CHECK:{{.*}}
diff --git a/llvm/test/Transforms/PGOProfile/criticaledge.ll b/llvm/test/Transforms/PGOProfile/criticaledge.ll
index c24925c68fa32db..388ba6f353b3603 100644
--- a/llvm/test/Transforms/PGOProfile/criticaledge.ll
+++ b/llvm/test/Transforms/PGOProfile/criticaledge.ll
@@ -48,7 +48,7 @@ sw.bb:
 
 sw.bb1:
 ; GEN: sw.bb1:
-; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 4)
+; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 6)
   %call2 = call i32 @bar(i32 1024)
   br label %sw.epilog
 
@@ -75,7 +75,7 @@ if.end:
 
 sw.default:
 ; GEN: sw.default:
-; GEN-NOT: call void @llvm.instrprof.increment
+; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 4)
   %call6 = call i32 @bar(i32 32)
   %cmp7 = icmp sgt i32 %j, 10
   br i1 %cmp7, label %if.then8, label %if.end9
@@ -90,7 +90,7 @@ if.then8:
 
 if.end9:
 ; GEN: if.end9:
-; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 6)
+; GEN-NOT: call void @llvm.instrprof.increment
   %res.0 = phi i32 [ %add, %if.then8 ], [ %call6, %sw.default ]
   br label %sw.epilog
 
diff --git a/llvm/test/Transforms/PGOProfile/fix_bfi.ll b/llvm/test/Transforms/PGOProfile/fix_bfi.ll
index fcfe3aa7b3a9cc1..aedef436210ef07 100644
--- a/llvm/test/Transforms/PGOProfile/fix_bfi.ll
+++ b/llvm/test/Transforms/PGOProfile/fix_bfi.ll
@@ -96,4 +96,4 @@ if.end26:
 }
 
 ; CHECK: define dso_local void @sort_basket(i64 %min, i64 %max) #0 !prof [[ENTRY_COUNT:![0-9]+]]
-; CHECK: [[ENTRY_COUNT]] = !{!"function_entry_count", i64 12949310}
+; CHECK: [[ENTRY_COUNT]] = !{!"function_entry_count", i64 13338888}
diff --git a/llvm/test/Transforms/PGOProfile/loop2.ll b/llvm/test/Transforms/PGOProfile/loop2.ll
index 071f8a6d5ad5949..c872c618a64be66 100644
--- a/llvm/test/Transforms/PGOProfile/loop2.ll
+++ b/llvm/test/Transforms/PGOProfile/loop2.ll
@@ -30,7 +30,8 @@ for.cond.outer:
 
 for.body.outer:
 ; GEN: for.body.outer:
-; GEN-NOT: call void @llvm.instrprof.increment
+; NOTENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 798733566382720768, i32 3, i32 1)
+; ENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 798733566382720768, i32 3, i32 2)
   br label %for.cond.inner
 
 for.cond.inner:
@@ -62,8 +63,7 @@ for.end.inner:
 
 for.inc.outer:
 ; GEN: for.inc.outer:
-; NOTENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 {{[0-9]+}}, i32 3, i32 1)
-; ENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 {{[0-9]+}}, i32 3, i32 2)
+; GEN-NOT: call void @llvm.instrprof.increment
   %inc.2 = add nsw i32 %i.0, 1
   br label %for.cond.outer
 
diff --git a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
index f5c3ca4aca470df..ef2fcc6a9e2485a 100644
--- a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
@@ -58,19 +58,19 @@ b1:
 b2:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 2, i32 0, i64 -1)
   br i1 %cmp, label %b7, label %b3
-; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 625
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 586
 
 b3:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 3, i32 0, i64 -1)
   br i1 %cmp, label %b7, label %b4
-; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 625
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 586
 ; CHECK2: br i1 %cmp, label %b7, label %b4,
 ; CHECK2-SAME: !prof ![[END172_PROF:[0-9]+]]
 
 b4:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 4, i32 0, i64 -1)
   br label %b2
-; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 624
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 585
 
 b5:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 5, i32 0, i64 -1)
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
index 36772eda1ede76e..9d38f8889396a6e 100644
--- a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
@@ -148,26 +148,26 @@ b1:
   br i1 %cmp, label %b2, label %b3
 ; CHECK:  edge b1 -> b2 probability is 0x40000000 / 0x80000000 = 50.00%
 ; CHECK:  edge b1 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
-; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 1973
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 2000
 
 b2:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 3, i32 0, i64 -1)
   br i1 %cmp, label %b3, label %b4
 ; CHECK:  edge b2 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
 ; CHECK:  edge b2 -> b4 probability is 0x40000000 / 0x80000000 = 50.00%
-; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 955
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 1000
 
 b3:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 4, i32 0, i64 -1)
   br label %b5
 ; CHECK:  edge b3 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
-; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1527
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1500
 
 b4:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 5, i32 0, i64 -1)
   br label %b5
 ; CHECK:  edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
-; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 445
+; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 500
 
 b5:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 6, i32 0, i64 -1)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
index 19e83649723d642..105494942d383d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
@@ -14,8 +14,8 @@ T1:                                               ; preds = %0
   %v1 = call i32 @f1(), !prof !12
   %cond3 = icmp eq i32 %v1, 412
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
-;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+;; The distribution factor -9223372036854775808 stands for 53.85%, whic is from 7/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !13
 ;; Probe 7 has two copies, since they don't share the same inline context, they are not
 ;; considered sharing samples, thus their distribution factors are not fixed up.
@@ -29,8 +29,8 @@ T1:                                               ; preds = %0
 Merge:                                            ; preds = %0
   %v2 = call i32 @f2(), !prof !12
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1)
-;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 8513881922462547968)
+;; The distribution factor  -9223372036854775808 stands for 46.25%, which is from 6/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64  -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 8513881922462547968), !dbg !13
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !18
@@ -77,4 +77,4 @@ attributes #0 = { inaccessiblememonly nounwind willreturn }
 !16 = distinct !DILocation(line: 10, column: 11, scope: !17)
 !17 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646551)
 !18 = !DILocation(line: 53, column: 3, scope: !15, inlinedAt: !19)
-!19 = !DILocation(line: 12, column: 3, scope: !4)
\ No newline at end of file
+!19 = !DILocation(line: 12, column: 3, scope: !4)



More information about the llvm-commits mailing list