[compiler-rt] Bfi precision (PR #66285)

Matthias Braun via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 13 15:40:07 PDT 2023


https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/66285

>From 7e20dc4192af9964eab4dbb205bb786e2a569aa4 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Fri, 22 Sep 2023 11:00:01 -0700
Subject: [PATCH 1/4] Switch some tests to use update_llc_test_checks.py

---
 .../CodeGen/AMDGPU/optimize-negated-cond.ll   | 123 +++++++++++++++---
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  93 ++++++-------
 .../CodeGen/X86/2008-04-17-CoalescerBug.ll    |   1 +
 llvm/test/CodeGen/X86/dup-cost.ll             |  54 ++++++--
 4 files changed, 195 insertions(+), 76 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
index ca51994b92203c3..f284df4d8a70b1b 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -1,10 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}negated_cond:
-; GCN: .LBB0_2:
-; GCN:   v_cndmask_b32_e64
-; GCN:   v_cmp_ne_u32_e64
 define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
+; GCN-LABEL: negated_cond:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_branch .LBB0_2
+; GCN-NEXT:  .LBB0_1: ; %loop.exit.guard
+; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
+; GCN-NEXT:    s_and_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_9
+; GCN-NEXT:  .LBB0_2: ; %bb1
+; GCN-NEXT:    ; =>This Loop Header: Depth=1
+; GCN-NEXT:    ; Child Loop BB0_4 Depth 2
+; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
+; GCN-NEXT:    s_mov_b32 s12, s6
+; GCN-NEXT:    s_branch .LBB0_4
+; GCN-NEXT:  .LBB0_3: ; %Flow1
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
+; GCN-NEXT:    s_cbranch_vccz .LBB0_1
+; GCN-NEXT:  .LBB0_4: ; %bb2
+; GCN-NEXT:    ; Parent Loop BB0_2 Depth=1
+; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
+; GCN-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GCN-NEXT:    s_lshl_b32 s12, s12, 5
+; GCN-NEXT:    s_cbranch_vccz .LBB0_6
+; GCN-NEXT:  ; %bb.5: ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_mov_b64 s[14:15], s[2:3]
+; GCN-NEXT:    s_branch .LBB0_7
+; GCN-NEXT:  .LBB0_6: ; %bb3
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_add_i32 s12, s12, 1
+; GCN-NEXT:    s_mov_b64 s[14:15], -1
+; GCN-NEXT:  .LBB0_7: ; %Flow
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GCN-NEXT:    s_mov_b64 s[16:17], -1
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_3
+; GCN-NEXT:  ; %bb.8: ; %bb4
+; GCN-NEXT:    ; in Loop: Header=BB0_4 Depth=2
+; GCN-NEXT:    s_ashr_i32 s13, s12, 31
+; GCN-NEXT:    s_lshl_b64 s[16:17], s[12:13], 2
+; GCN-NEXT:    s_mov_b64 s[14:15], 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s16
+; GCN-NEXT:    v_mov_b32_e32 v2, s17
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_cmp_eq_u32 s12, 32
+; GCN-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GCN-NEXT:    s_branch .LBB0_3
+; GCN-NEXT:  .LBB0_9: ; %DummyReturnBlock
+; GCN-NEXT:    s_endpgm
 bb:
   br label %bb1
 
@@ -30,20 +88,51 @@ bb4:
   br i1 %tmp7, label %bb1, label %bb2
 }
 
-; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
-; GCN:   s_cmp_lg_u32
-; GCN:   s_cselect_b64  [[CC1:[^,]+]], -1, 0
-; GCN:   s_branch [[BB1:.LBB[0-9]+_[0-9]+]]
-; GCN: [[BB0:.LBB[0-9]+_[0-9]+]]
-; GCN-NOT: v_cndmask_b32
-; GCN-NOT: v_cmp
-; GCN: [[BB1]]:
-; GCN:   s_mov_b64 vcc, [[CC1]]
-; GCN:   s_cbranch_vccz [[BB2:.LBB[0-9]+_[0-9]+]]
-; GCN:   s_mov_b64 vcc, exec
-; GCN:   s_cbranch_execnz [[BB0]]
-; GCN: [[BB2]]:
 define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) {
+; GCN-LABEL: negated_cond_dominated_blocks:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s3, s6
+; GCN-NEXT:    s_branch .LBB1_2
+; GCN-NEXT:  .LBB1_1: ; %bb7
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_ashr_i32 s3, s2, 31
+; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], 2
+; GCN-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-NEXT:    v_mov_b32_e32 v2, s9
+; GCN-NEXT:    s_cmp_eq_u32 s2, 32
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b32 s3, s2
+; GCN-NEXT:    s_cbranch_scc1 .LBB1_6
+; GCN-NEXT:  .LBB1_2: ; %bb4
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_cbranch_vccz .LBB1_4
+; GCN-NEXT:  ; %bb.3: ; %bb6
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_add_i32 s2, s3, 1
+; GCN-NEXT:    s_mov_b64 vcc, exec
+; GCN-NEXT:    s_cbranch_execnz .LBB1_1
+; GCN-NEXT:    s_branch .LBB1_5
+; GCN-NEXT:  .LBB1_4: ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    ; implicit-def: $sgpr2
+; GCN-NEXT:    s_mov_b64 vcc, 0
+; GCN-NEXT:  .LBB1_5: ; %bb5
+; GCN-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GCN-NEXT:    s_lshl_b32 s2, s3, 5
+; GCN-NEXT:    s_or_b32 s2, s2, 1
+; GCN-NEXT:    s_branch .LBB1_1
+; GCN-NEXT:  .LBB1_6: ; %bb3
+; GCN-NEXT:    s_endpgm
 bb:
   br label %bb2
 
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index bc7b26abe7e046f..216d4cca097001c 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=ve | FileCheck %s
 ; RUN: llc < %s -mtriple=ve -relocation-model=pic \
 ; RUN:     | FileCheck %s -check-prefix=PIC
@@ -11,22 +12,22 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; CHECK-LABEL: br_jt3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
-; CHECK-NEXT:    breq.w 1, %s0, .LBB{{[0-9]+}}_1
+; CHECK-NEXT:    breq.w 1, %s0, .LBB0_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    breq.w 4, %s0, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    breq.w 4, %s0, .LBB0_5
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    brne.w 2, %s0, .LBB{{[0-9]+}}_6
+; CHECK-NEXT:    brne.w 2, %s0, .LBB0_6
 ; CHECK-NEXT:  # %bb.4:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    or %s0, 7, (0)1
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -78,7 +79,7 @@ define signext i32 @br_jt4(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 3, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_2
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB1_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
@@ -87,7 +88,7 @@ define signext i32 @br_jt4(i32 signext %0) {
 ; CHECK-NEXT:    lea.sl %s1, .Lswitch.table.br_jt4 at hi(, %s1)
 ; CHECK-NEXT:    ldl.sx %s0, (%s0, %s1)
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -138,18 +139,18 @@ define signext i32 @br_jt7(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 8, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_3
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB2_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    and %s2, %s1, (48)0
 ; CHECK-NEXT:    lea %s3, 463
 ; CHECK-NEXT:    and %s3, %s3, (32)0
 ; CHECK-NEXT:    srl %s2, %s3, %s2
 ; CHECK-NEXT:    and %s2, 1, %s2
-; CHECK-NEXT:    brne.w 0, %s2, .LBB{{[0-9]+}}_2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:    brne.w 0, %s2, .LBB2_2
+; CHECK-NEXT:  .LBB2_3:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    lea %s1, .Lswitch.table.br_jt7 at lo
@@ -219,18 +220,18 @@ define signext i32 @br_jt8(i32 signext %0) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s1, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s2, 8, %s1
-; CHECK-NEXT:    brgt.w 0, %s2, .LBB{{[0-9]+}}_3
+; CHECK-NEXT:    brgt.w 0, %s2, .LBB3_3
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    and %s2, %s1, (48)0
 ; CHECK-NEXT:    lea %s3, 495
 ; CHECK-NEXT:    and %s3, %s3, (32)0
 ; CHECK-NEXT:    srl %s2, %s3, %s2
 ; CHECK-NEXT:    and %s2, 1, %s2
-; CHECK-NEXT:    brne.w 0, %s2, .LBB{{[0-9]+}}_2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:    brne.w 0, %s2, .LBB3_2
+; CHECK-NEXT:  .LBB3_3:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB3_2:
 ; CHECK-NEXT:    adds.w.sx %s0, %s1, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    lea %s1, .Lswitch.table.br_jt8 at lo
@@ -298,23 +299,23 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; CHECK-LABEL: br_jt3_m:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
-; CHECK-NEXT:    breq.w 1, %s0, .LBB{{[0-9]+}}_1
+; CHECK-NEXT:    breq.w 1, %s0, .LBB4_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    breq.w 4, %s0, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    breq.w 4, %s0, .LBB4_5
 ; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    brne.w 2, %s0, .LBB{{[0-9]+}}_6
+; CHECK-NEXT:    brne.w 2, %s0, .LBB4_6
 ; CHECK-NEXT:  # %bb.4:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB4_1:
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB4_5:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB4_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -368,7 +369,7 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s2, -1, %s0
 ; CHECK-NEXT:    cmpu.w %s3, 3, %s2
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_5
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB5_5
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s2, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -378,18 +379,18 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    ld %s2, (%s2, %s0)
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s2)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB5_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB5_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB5_4:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB5_5:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -455,7 +456,7 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s2, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, -1, %s2
 ; CHECK-NEXT:    cmpu.w %s3, 8, %s0
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_8
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB6_8
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -466,32 +467,32 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s3)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB6_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB6_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB6_4:
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_8:
+; CHECK-NEXT:  .LBB6_8:
 ; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_9:
+; CHECK-NEXT:  .LBB6_9:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_7:
+; CHECK-NEXT:  .LBB6_7:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB6_6:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB6_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
@@ -591,7 +592,7 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s2, %s0, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, -1, %s2
 ; CHECK-NEXT:    cmpu.w %s3, 8, %s0
-; CHECK-NEXT:    brgt.w 0, %s3, .LBB{{[0-9]+}}_9
+; CHECK-NEXT:    brgt.w 0, %s3, .LBB7_9
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
 ; CHECK-NEXT:    sll %s0, %s0, 3
@@ -602,36 +603,36 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s0, 3, (0)1
 ; CHECK-NEXT:    b.l.t (, %s3)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB7_3:
 ; CHECK-NEXT:    or %s0, 4, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_4:
+; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_9:
+; CHECK-NEXT:  .LBB7_9:
 ; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB{{[0-9]+}}_10:
+; CHECK-NEXT:  .LBB7_10:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_5:
+; CHECK-NEXT:  .LBB7_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -5, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_6:
+; CHECK-NEXT:  .LBB7_6:
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_8:
+; CHECK-NEXT:  .LBB7_8:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB{{[0-9]+}}_7:
+; CHECK-NEXT:  .LBB7_7:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index 6d596195fe7f696..bf939c4131080d3 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
 ; Make sure xorl operands are 32-bit registers.
 
diff --git a/llvm/test/CodeGen/X86/dup-cost.ll b/llvm/test/CodeGen/X86/dup-cost.ll
index 523f0f1154e94d3..ec9d36aa2a11b65 100644
--- a/llvm/test/CodeGen/X86/dup-cost.ll
+++ b/llvm/test/CodeGen/X86/dup-cost.ll
@@ -1,14 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 ; Cold function, %dup should not be duplicated into predecessors.
 define i32 @cold(i32 %a, ptr %p, ptr %q) !prof !21 {
-; CHECK-LABEL: cold
-; CHECK:       %entry
-; CHECK:       %true1
-; CHECK:       %dup
-; CHECK:       %true2
-; CHECK:       %false1
-; CHECK:       %false2
+; CHECK-LABEL: cold:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jl .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %true1
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    addl $2, %eax
+; CHECK-NEXT:  .LBB0_3: # %dup
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jl .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %true2
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_2: # %false1
+; CHECK-NEXT:    movl (%rdx), %eax
+; CHECK-NEXT:    addl $-3, %eax
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30
@@ -44,12 +58,26 @@ exit:
 ; Same code as previous function, but with hot profile count.
 ; So %dup should be duplicated into predecessors.
 define i32 @hot(i32 %a, ptr %p, ptr %q) !prof !22 {
-; CHECK-LABEL: hot
-; CHECK:       %entry
-; CHECK:       %true1
-; CHECK:       %false2
-; CHECK:       %false1
-; CHECK:       %true2
+; CHECK-LABEL: hot:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpl $2, %edi
+; CHECK-NEXT:    jl .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %true1
+; CHECK-NEXT:    movl (%rsi), %eax
+; CHECK-NEXT:    addl $2, %eax
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jge .LBB1_4
+; CHECK-NEXT:  .LBB1_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB1_2: # %false1
+; CHECK-NEXT:    movl (%rdx), %eax
+; CHECK-NEXT:    addl $-3, %eax
+; CHECK-NEXT:    cmpl $5, %eax
+; CHECK-NEXT:    jl .LBB1_5
+; CHECK-NEXT:  .LBB1_4: # %true2
+; CHECK-NEXT:    xorl %edi, %eax
+; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30

>From ebe61e18a163d27bfd60908b9d3ca316eb92c2bc Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Thu, 21 Sep 2023 11:18:57 -0700
Subject: [PATCH 2/4] MachineBlockPlacement: Add tolerance to comparisons

The LLVM BFI data is susceptible to numerical "noise" in the lower bits:
To get some feeling for this consider that we store `BlockFrequency` as
64-bit integers and branch weights as 32-bit. There is a natural
numerical difference when the branch weights divisions are not naturally
representable as integers and because block frequencies are scaled by
loop factors the absolute values of those errors can appear big.
Regardless it should be a small relative difference relative to the
frequency numbers.

- Add `BlockFrequency::almostEqual` function to check whether two
  frequencies are close enough to each other that the topmost N bits
  are equal and only lower bits differ.
- Change `MachineBlockPlacement::selectBestCandidateBlock` to be a
  two-pass algorithm that first finds determines the candidates with the
  biggest frequency and then performs a 2nd pass to check among blocks
  with "almostEqual" frequency to prefer the one already being the
  layout successor or first in the worklist.

The currently value of 20 significant bits works well to reduce
unnecessary changes in my upcoming BFI precision changes.
---
 llvm/include/llvm/Support/BlockFrequency.h    |  28 +++
 llvm/lib/CodeGen/MachineBlockPlacement.cpp    |  25 ++-
 llvm/test/CodeGen/AArch64/cfi-fixup.ll        |  24 +-
 .../Generic/machine-function-splitter.ll      |   1 -
 .../Mips/indirect-jump-hazard/jumptables.ll   | 208 +++++++++---------
 llvm/test/CodeGen/Mips/jump-table-mul.ll      |  32 +--
 llvm/test/CodeGen/Mips/nacl-align.ll          |  10 +-
 llvm/test/CodeGen/Mips/pseudo-jump-fill.ll    |  14 +-
 .../PowerPC/jump-tables-collapse-rotate.ll    |  39 ++--
 llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll   |   8 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll   |  20 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crun.ll   |  32 +--
 llvm/test/CodeGen/PowerPC/pr43527.ll          |   8 +-
 llvm/test/CodeGen/PowerPC/pr45448.ll          |  14 +-
 .../CodeGen/RISCV/shrinkwrap-jump-table.ll    |  20 +-
 .../CodeGen/Thumb2/bti-indirect-branches.ll   |  37 +---
 llvm/test/CodeGen/Thumb2/constant-hoisting.ll |  18 +-
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  64 +++---
 .../CodeGen/X86/2008-04-17-CoalescerBug.ll    |  18 +-
 llvm/test/CodeGen/X86/2009-08-12-badswitch.ll |   6 +-
 llvm/test/CodeGen/X86/block-placement.ll      |   2 +-
 llvm/test/CodeGen/X86/callbr-asm-outputs.ll   |  20 +-
 llvm/test/CodeGen/X86/dup-cost.ll             |   6 +-
 llvm/test/CodeGen/X86/mul-constant-result.ll  | 112 +++++-----
 llvm/test/CodeGen/X86/pic.ll                  |  22 +-
 llvm/test/CodeGen/X86/pr38743.ll              |  10 +-
 llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll |   8 +-
 .../speculative-load-hardening-indirect.ll    |  24 +-
 llvm/test/CodeGen/X86/switch.ll               |  21 +-
 llvm/test/CodeGen/X86/win-catchpad.ll         |   8 +-
 llvm/unittests/Support/BlockFrequencyTest.cpp |  41 ++++
 31 files changed, 485 insertions(+), 415 deletions(-)

diff --git a/llvm/include/llvm/Support/BlockFrequency.h b/llvm/include/llvm/Support/BlockFrequency.h
index 8b172ee486aab85..52348b9cba7be81 100644
--- a/llvm/include/llvm/Support/BlockFrequency.h
+++ b/llvm/include/llvm/Support/BlockFrequency.h
@@ -14,8 +14,10 @@
 #define LLVM_SUPPORT_BLOCKFREQUENCY_H
 
 #include <cassert>
+#include <climits>
 #include <cstdint>
 #include <optional>
+#include <utility>
 
 namespace llvm {
 
@@ -94,6 +96,32 @@ class BlockFrequency {
     return *this;
   }
 
+  /// Returns true if `this` is smaller or greater than `other` and the
+  /// magnitude of the difference is below the topmost `SignificantBits` of
+  /// `this`.
+  ///
+  /// The formula is related to comparing a "relative change" to a threshold.
+  /// When choosing the threshold as a negative power-of-two things can be
+  /// computed cheaply:
+  ///   with A = max(this->Frequency, Other.Frequency);
+  ///        B = min(Other.Frequency, this->Frequency); T = SignificantBits
+  ///   relative_change(A, B) = abs(A - B) / A
+  ///         (A - B) / A  <=  2**(-T)
+  ///   <=>   A - B        <=  2**(-T) * A
+  ///   <=>   A - B        <=  shr(A, T)
+  bool almostEqual(BlockFrequency Other, unsigned SignificantBits = 20) const {
+    assert(0 <= SignificantBits &&
+           SignificantBits < sizeof(Frequency) * CHAR_BIT &&
+           "invalid SignificantBits value");
+    uint64_t Max = Frequency;
+    uint64_t Min = Other.Frequency;
+    if (Max < Min)
+      std::swap(Min, Max);
+    uint64_t Diff = Max - Min;
+    uint64_t Threshold = Max >> SignificantBits;
+    return Diff <= Threshold;
+  }
+
   bool operator<(BlockFrequency RHS) const {
     return Frequency < RHS.Frequency;
   }
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index d0d3574b30bfd88..7eabad2bcb1f4e0 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -1715,7 +1715,8 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
 
   bool IsEHPad = WorkList[0]->isEHPad();
 
-  MachineBasicBlock *BestBlock = nullptr;
+  MachineBasicBlock *ChainEnd = *std::prev(Chain.end());
+  MachineBasicBlock *LayoutSucc = ChainEnd->getNextNode();
   BlockFrequency BestFreq;
   for (MachineBasicBlock *MBB : WorkList) {
     assert(MBB->isEHPad() == IsEHPad &&
@@ -1731,7 +1732,8 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
     BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
     LLVM_DEBUG(dbgs() << "    " << getBlockName(MBB) << " -> "
                       << printBlockFreq(MBFI->getMBFI(), CandidateFreq)
-                      << " (freq)\n");
+                      << " (freq) is_layoutsucc " << (MBB == LayoutSucc)
+                      << '\n');
 
     // For ehpad, we layout the least probable first as to avoid jumping back
     // from least probable landingpads to more probable ones.
@@ -1751,12 +1753,27 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
     //                 +-------------------------------------+
     //                 V                                     |
     // OuterLp -> OuterCleanup -> Resume     InnerLp -> InnerCleanup
-    if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
+    if (BestFreq > BlockFrequency(0) && (IsEHPad ^ (BestFreq >= CandidateFreq)))
       continue;
 
-    BestBlock = MBB;
     BestFreq = CandidateFreq;
   }
+  // Perform a 2nd pass to make the output more stable: Check for blocks with
+  // almost the same frequency as `BestFreq` and prefer the one that is already
+  // a successor to `ChainEnd` first or whichever comes first in `WorkList`.
+  MachineBasicBlock *BestBlock = nullptr;
+  for (MachineBasicBlock *MBB : WorkList) {
+    // Give precedence to the layout successor otherwise the first candidate.
+    if (BestBlock != nullptr && MBB != LayoutSucc)
+      continue;
+    BlockChain &SuccChain = *BlockToChain[MBB];
+    if (&SuccChain == &Chain)
+      continue;
+    BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
+    if (!BestFreq.almostEqual(CandidateFreq))
+      continue;
+    BestBlock = MBB;
+  }
 
   return BestBlock;
 }
diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup.ll b/llvm/test/CodeGen/AArch64/cfi-fixup.ll
index 9a4ad3bb07ee364..e746ea745b92a8c 100644
--- a/llvm/test/CodeGen/AArch64/cfi-fixup.ll
+++ b/llvm/test/CodeGen/AArch64/cfi-fixup.ll
@@ -8,13 +8,13 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    cbz w0, .LBB0_4
+; CHECK-NEXT:    cbz w0, .LBB0_6
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    cmp w0, #2
-; CHECK-NEXT:    b.eq .LBB0_5
+; CHECK-NEXT:    b.eq .LBB0_4
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    cmp w0, #1
-; CHECK-NEXT:    b.ne .LBB0_6
+; CHECK-NEXT:    b.ne .LBB0_5
 ; CHECK-NEXT:  // %bb.3: // %if.then2
 ; CHECK-NEXT:    bl g1
 ; CHECK-NEXT:    add w0, w0, #1
@@ -22,27 +22,27 @@ define i32 @f0(i32 %x) #0 {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:  .LBB0_4: // %if.then5
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    bl g0
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_5: // %if.then5
+; CHECK-NEXT:  .LBB0_5: // %if.end7
 ; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    .cfi_remember_state
-; CHECK-NEXT:    bl g0
-; CHECK-NEXT:    mov w8, #1
-; CHECK-NEXT:    sub w0, w8, w0
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_6: // %if.end7
+; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    .cfi_restore_state
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
@@ -115,7 +115,7 @@ define i32 @f2(i32 %x) #0 {
 ; CHECK-NEXT:    cbz w0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %if.end
 ; CHECK-NEXT:    bl g1
-; CHECK-NEXT:    mov w8, #1
+; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    sub w0, w8, w0
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter.ll b/llvm/test/CodeGen/Generic/machine-function-splitter.ll
index 364eadd64c7556d..cdbf0690356b628 100644
--- a/llvm/test/CodeGen/Generic/machine-function-splitter.ll
+++ b/llvm/test/CodeGen/Generic/machine-function-splitter.ll
@@ -523,7 +523,6 @@ define i32 @foo18(i32 %in) !prof !14 !section_prefix !15 {
 ; MFS-DEFAULTS-LABEL:        foo18
 ; MFS-DEFAULTS:              .section        .text.split.foo18
 ; MFS-DEFAULTS-NEXT:         foo18.cold:
-; MFS-DEFAULTS-SAME:           %common.ret
 ; MFS-DEFAULTS-X86-DAG:        jmp     qux
 ; MFS-DEFAULTS-X86-DAG:        jmp     bam
 ; MFS-DEFAULTS-AARCH64-NOT:    b       bar
diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
index b079169974d8b85..ff863a19b4918fb 100644
--- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
+++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
@@ -40,7 +40,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R2-NEXT:    addiu $sp, $sp, -16
 ; MIPS32R2-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS32R2-NEXT:    sltiu $1, $4, 7
-; MIPS32R2-NEXT:    beqz $1, $BB0_3
+; MIPS32R2-NEXT:    beqz $1, $BB0_9
 ; MIPS32R2-NEXT:    sw $4, 4($sp)
 ; MIPS32R2-NEXT:  $BB0_1: # %entry
 ; MIPS32R2-NEXT:    sll $1, $4, 2
@@ -54,39 +54,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_3: # %sw.epilog
-; MIPS32R2-NEXT:    lui $1, %hi($.str.7)
-; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
-; MIPS32R2-NEXT:    j $BB0_10
-; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_4: # %sw.bb1
+; MIPS32R2-NEXT:  $BB0_3: # %sw.bb1
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.1)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.1)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_5: # %sw.bb2
+; MIPS32R2-NEXT:  $BB0_4: # %sw.bb2
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.2)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.2)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_6: # %sw.bb3
+; MIPS32R2-NEXT:  $BB0_5: # %sw.bb3
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.3)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.3)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_7: # %sw.bb4
+; MIPS32R2-NEXT:  $BB0_6: # %sw.bb4
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.4)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.4)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_8: # %sw.bb5
+; MIPS32R2-NEXT:  $BB0_7: # %sw.bb5
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.5)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.5)
 ; MIPS32R2-NEXT:    j $BB0_10
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
-; MIPS32R2-NEXT:  $BB0_9: # %sw.bb6
+; MIPS32R2-NEXT:  $BB0_8: # %sw.bb6
 ; MIPS32R2-NEXT:    lui $1, %hi($.str.6)
 ; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.6)
+; MIPS32R2-NEXT:    j $BB0_10
+; MIPS32R2-NEXT:    sw $1, 8($sp)
+; MIPS32R2-NEXT:  $BB0_9: # %sw.epilog
+; MIPS32R2-NEXT:    lui $1, %hi($.str.7)
+; MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
 ; MIPS32R2-NEXT:    sw $1, 8($sp)
 ; MIPS32R2-NEXT:  $BB0_10: # %return
 ; MIPS32R2-NEXT:    lw $2, 8($sp)
@@ -98,7 +98,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R6-NEXT:    addiu $sp, $sp, -16
 ; MIPS32R6-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS32R6-NEXT:    sltiu $1, $4, 7
-; MIPS32R6-NEXT:    beqz $1, $BB0_3
+; MIPS32R6-NEXT:    beqz $1, $BB0_9
 ; MIPS32R6-NEXT:    sw $4, 4($sp)
 ; MIPS32R6-NEXT:  $BB0_1: # %entry
 ; MIPS32R6-NEXT:    sll $1, $4, 2
@@ -112,39 +112,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_3: # %sw.epilog
-; MIPS32R6-NEXT:    lui $1, %hi($.str.7)
-; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
-; MIPS32R6-NEXT:    j $BB0_10
-; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_4: # %sw.bb1
+; MIPS32R6-NEXT:  $BB0_3: # %sw.bb1
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.1)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.1)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_5: # %sw.bb2
+; MIPS32R6-NEXT:  $BB0_4: # %sw.bb2
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.2)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.2)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_6: # %sw.bb3
+; MIPS32R6-NEXT:  $BB0_5: # %sw.bb3
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.3)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.3)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_7: # %sw.bb4
+; MIPS32R6-NEXT:  $BB0_6: # %sw.bb4
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.4)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.4)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_8: # %sw.bb5
+; MIPS32R6-NEXT:  $BB0_7: # %sw.bb5
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.5)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.5)
 ; MIPS32R6-NEXT:    j $BB0_10
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
-; MIPS32R6-NEXT:  $BB0_9: # %sw.bb6
+; MIPS32R6-NEXT:  $BB0_8: # %sw.bb6
 ; MIPS32R6-NEXT:    lui $1, %hi($.str.6)
 ; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.6)
+; MIPS32R6-NEXT:    j $BB0_10
+; MIPS32R6-NEXT:    sw $1, 8($sp)
+; MIPS32R6-NEXT:  $BB0_9: # %sw.epilog
+; MIPS32R6-NEXT:    lui $1, %hi($.str.7)
+; MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
 ; MIPS32R6-NEXT:    sw $1, 8($sp)
 ; MIPS32R6-NEXT:  $BB0_10: # %return
 ; MIPS32R6-NEXT:    lw $2, 8($sp)
@@ -157,7 +157,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS64R2-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R2-NEXT:    sltiu $1, $2, 7
-; MIPS64R2-NEXT:    beqz $1, .LBB0_3
+; MIPS64R2-NEXT:    beqz $1, .LBB0_9
 ; MIPS64R2-NEXT:    sw $4, 4($sp)
 ; MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; MIPS64R2-NEXT:    dsll $1, $2, 3
@@ -179,16 +179,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_3: # %sw.epilog
-; MIPS64R2-NEXT:    lui $1, %highest(.L.str.7)
-; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.7)
-; MIPS64R2-NEXT:    dsll $1, $1, 16
-; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.7)
-; MIPS64R2-NEXT:    dsll $1, $1, 16
-; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.7)
-; MIPS64R2-NEXT:    j .LBB0_10
-; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_4: # %sw.bb1
+; MIPS64R2-NEXT:  .LBB0_3: # %sw.bb1
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.1)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.1)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -197,7 +188,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.1)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_5: # %sw.bb2
+; MIPS64R2-NEXT:  .LBB0_4: # %sw.bb2
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.2)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.2)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -206,7 +197,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.2)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_6: # %sw.bb3
+; MIPS64R2-NEXT:  .LBB0_5: # %sw.bb3
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.3)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.3)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -215,7 +206,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.3)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_7: # %sw.bb4
+; MIPS64R2-NEXT:  .LBB0_6: # %sw.bb4
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.4)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.4)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -224,7 +215,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.4)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_8: # %sw.bb5
+; MIPS64R2-NEXT:  .LBB0_7: # %sw.bb5
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.5)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.5)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
@@ -233,13 +224,22 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.5)
 ; MIPS64R2-NEXT:    j .LBB0_10
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
-; MIPS64R2-NEXT:  .LBB0_9: # %sw.bb6
+; MIPS64R2-NEXT:  .LBB0_8: # %sw.bb6
 ; MIPS64R2-NEXT:    lui $1, %highest(.L.str.6)
 ; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.6)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
 ; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.6)
 ; MIPS64R2-NEXT:    dsll $1, $1, 16
 ; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.6)
+; MIPS64R2-NEXT:    j .LBB0_10
+; MIPS64R2-NEXT:    sd $1, 8($sp)
+; MIPS64R2-NEXT:  .LBB0_9: # %sw.epilog
+; MIPS64R2-NEXT:    lui $1, %highest(.L.str.7)
+; MIPS64R2-NEXT:    daddiu $1, $1, %higher(.L.str.7)
+; MIPS64R2-NEXT:    dsll $1, $1, 16
+; MIPS64R2-NEXT:    daddiu $1, $1, %hi(.L.str.7)
+; MIPS64R2-NEXT:    dsll $1, $1, 16
+; MIPS64R2-NEXT:    daddiu $1, $1, %lo(.L.str.7)
 ; MIPS64R2-NEXT:    sd $1, 8($sp)
 ; MIPS64R2-NEXT:  .LBB0_10: # %return
 ; MIPS64R2-NEXT:    ld $2, 8($sp)
@@ -252,7 +252,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    .cfi_def_cfa_offset 16
 ; MIPS64R6-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R6-NEXT:    sltiu $1, $2, 7
-; MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; MIPS64R6-NEXT:    beqz $1, .LBB0_9
 ; MIPS64R6-NEXT:    sw $4, 4($sp)
 ; MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; MIPS64R6-NEXT:    dsll $1, $2, 3
@@ -274,16 +274,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_3: # %sw.epilog
-; MIPS64R6-NEXT:    lui $1, %highest(.L.str.7)
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.7)
-; MIPS64R6-NEXT:    dsll $1, $1, 16
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.7)
-; MIPS64R6-NEXT:    dsll $1, $1, 16
-; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.7)
-; MIPS64R6-NEXT:    j .LBB0_10
-; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_4: # %sw.bb1
+; MIPS64R6-NEXT:  .LBB0_3: # %sw.bb1
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.1)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.1)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -292,7 +283,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.1)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_5: # %sw.bb2
+; MIPS64R6-NEXT:  .LBB0_4: # %sw.bb2
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.2)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.2)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -301,7 +292,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.2)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_6: # %sw.bb3
+; MIPS64R6-NEXT:  .LBB0_5: # %sw.bb3
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.3)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.3)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -310,7 +301,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.3)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_7: # %sw.bb4
+; MIPS64R6-NEXT:  .LBB0_6: # %sw.bb4
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.4)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.4)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -319,7 +310,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.4)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_8: # %sw.bb5
+; MIPS64R6-NEXT:  .LBB0_7: # %sw.bb5
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.5)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.5)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
@@ -328,13 +319,22 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.5)
 ; MIPS64R6-NEXT:    j .LBB0_10
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
-; MIPS64R6-NEXT:  .LBB0_9: # %sw.bb6
+; MIPS64R6-NEXT:  .LBB0_8: # %sw.bb6
 ; MIPS64R6-NEXT:    lui $1, %highest(.L.str.6)
 ; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.6)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
 ; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.6)
 ; MIPS64R6-NEXT:    dsll $1, $1, 16
 ; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.6)
+; MIPS64R6-NEXT:    j .LBB0_10
+; MIPS64R6-NEXT:    sd $1, 8($sp)
+; MIPS64R6-NEXT:  .LBB0_9: # %sw.epilog
+; MIPS64R6-NEXT:    lui $1, %highest(.L.str.7)
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(.L.str.7)
+; MIPS64R6-NEXT:    dsll $1, $1, 16
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(.L.str.7)
+; MIPS64R6-NEXT:    dsll $1, $1, 16
+; MIPS64R6-NEXT:    daddiu $1, $1, %lo(.L.str.7)
 ; MIPS64R6-NEXT:    sd $1, 8($sp)
 ; MIPS64R6-NEXT:  .LBB0_10: # %return
 ; MIPS64R6-NEXT:    ld $2, 8($sp)
@@ -349,7 +349,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R2-NEXT:    .cfi_def_cfa_offset 16
 ; PIC-MIPS32R2-NEXT:    addu $2, $2, $25
 ; PIC-MIPS32R2-NEXT:    sltiu $1, $4, 7
-; PIC-MIPS32R2-NEXT:    beqz $1, $BB0_3
+; PIC-MIPS32R2-NEXT:    beqz $1, $BB0_9
 ; PIC-MIPS32R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS32R2-NEXT:  $BB0_1: # %entry
 ; PIC-MIPS32R2-NEXT:    sll $1, $4, 2
@@ -364,39 +364,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_3: # %sw.epilog
-; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.7)($2)
-; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
-; PIC-MIPS32R2-NEXT:    b $BB0_10
-; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_4: # %sw.bb1
+; PIC-MIPS32R2-NEXT:  $BB0_3: # %sw.bb1
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.1)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.1)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_5: # %sw.bb2
+; PIC-MIPS32R2-NEXT:  $BB0_4: # %sw.bb2
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.2)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.2)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_6: # %sw.bb3
+; PIC-MIPS32R2-NEXT:  $BB0_5: # %sw.bb3
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.3)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.3)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_7: # %sw.bb4
+; PIC-MIPS32R2-NEXT:  $BB0_6: # %sw.bb4
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.4)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.4)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_8: # %sw.bb5
+; PIC-MIPS32R2-NEXT:  $BB0_7: # %sw.bb5
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.5)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.5)
 ; PIC-MIPS32R2-NEXT:    b $BB0_10
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R2-NEXT:  $BB0_9: # %sw.bb6
+; PIC-MIPS32R2-NEXT:  $BB0_8: # %sw.bb6
 ; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.6)($2)
 ; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.6)
+; PIC-MIPS32R2-NEXT:    b $BB0_10
+; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
+; PIC-MIPS32R2-NEXT:  $BB0_9: # %sw.epilog
+; PIC-MIPS32R2-NEXT:    lw $1, %got($.str.7)($2)
+; PIC-MIPS32R2-NEXT:    addiu $1, $1, %lo($.str.7)
 ; PIC-MIPS32R2-NEXT:    sw $1, 8($sp)
 ; PIC-MIPS32R2-NEXT:  $BB0_10: # %return
 ; PIC-MIPS32R2-NEXT:    lw $2, 8($sp)
@@ -411,7 +411,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R6-NEXT:    .cfi_def_cfa_offset 16
 ; PIC-MIPS32R6-NEXT:    addu $2, $2, $25
 ; PIC-MIPS32R6-NEXT:    sltiu $1, $4, 7
-; PIC-MIPS32R6-NEXT:    beqz $1, $BB0_3
+; PIC-MIPS32R6-NEXT:    beqz $1, $BB0_9
 ; PIC-MIPS32R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS32R6-NEXT:  $BB0_1: # %entry
 ; PIC-MIPS32R6-NEXT:    sll $1, $4, 2
@@ -426,39 +426,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_3: # %sw.epilog
-; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.7)($2)
-; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
-; PIC-MIPS32R6-NEXT:    b $BB0_10
-; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_4: # %sw.bb1
+; PIC-MIPS32R6-NEXT:  $BB0_3: # %sw.bb1
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.1)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.1)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_5: # %sw.bb2
+; PIC-MIPS32R6-NEXT:  $BB0_4: # %sw.bb2
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.2)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.2)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_6: # %sw.bb3
+; PIC-MIPS32R6-NEXT:  $BB0_5: # %sw.bb3
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.3)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.3)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_7: # %sw.bb4
+; PIC-MIPS32R6-NEXT:  $BB0_6: # %sw.bb4
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.4)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.4)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_8: # %sw.bb5
+; PIC-MIPS32R6-NEXT:  $BB0_7: # %sw.bb5
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.5)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.5)
 ; PIC-MIPS32R6-NEXT:    b $BB0_10
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
-; PIC-MIPS32R6-NEXT:  $BB0_9: # %sw.bb6
+; PIC-MIPS32R6-NEXT:  $BB0_8: # %sw.bb6
 ; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.6)($2)
 ; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.6)
+; PIC-MIPS32R6-NEXT:    b $BB0_10
+; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
+; PIC-MIPS32R6-NEXT:  $BB0_9: # %sw.epilog
+; PIC-MIPS32R6-NEXT:    lw $1, %got($.str.7)($2)
+; PIC-MIPS32R6-NEXT:    addiu $1, $1, %lo($.str.7)
 ; PIC-MIPS32R6-NEXT:    sw $1, 8($sp)
 ; PIC-MIPS32R6-NEXT:  $BB0_10: # %return
 ; PIC-MIPS32R6-NEXT:    lw $2, 8($sp)
@@ -474,7 +474,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R2-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R2-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_9
 ; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R2-NEXT:    dsll $1, $3, 3
@@ -489,39 +489,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_3: # %sw.epilog
-; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.7)($2)
-; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
-; PIC-MIPS64R2-NEXT:    b .LBB0_10
-; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_4: # %sw.bb1
+; PIC-MIPS64R2-NEXT:  .LBB0_3: # %sw.bb1
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.1)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.1)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_5: # %sw.bb2
+; PIC-MIPS64R2-NEXT:  .LBB0_4: # %sw.bb2
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.2)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.2)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_6: # %sw.bb3
+; PIC-MIPS64R2-NEXT:  .LBB0_5: # %sw.bb3
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.3)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.3)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_7: # %sw.bb4
+; PIC-MIPS64R2-NEXT:  .LBB0_6: # %sw.bb4
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.4)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.4)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_8: # %sw.bb5
+; PIC-MIPS64R2-NEXT:  .LBB0_7: # %sw.bb5
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.5)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.5)
 ; PIC-MIPS64R2-NEXT:    b .LBB0_10
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R2-NEXT:  .LBB0_9: # %sw.bb6
+; PIC-MIPS64R2-NEXT:  .LBB0_8: # %sw.bb6
 ; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.6)($2)
 ; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.6)
+; PIC-MIPS64R2-NEXT:    b .LBB0_10
+; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
+; PIC-MIPS64R2-NEXT:  .LBB0_9: # %sw.epilog
+; PIC-MIPS64R2-NEXT:    ld $1, %got_page(.L.str.7)($2)
+; PIC-MIPS64R2-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
 ; PIC-MIPS64R2-NEXT:    sd $1, 8($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_10: # %return
 ; PIC-MIPS64R2-NEXT:    ld $2, 8($sp)
@@ -537,7 +537,7 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R6-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R6-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_9
 ; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R6-NEXT:    dsll $1, $3, 3
@@ -552,39 +552,39 @@ define ptr @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_3: # %sw.epilog
-; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.7)($2)
-; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
-; PIC-MIPS64R6-NEXT:    b .LBB0_10
-; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_4: # %sw.bb1
+; PIC-MIPS64R6-NEXT:  .LBB0_3: # %sw.bb1
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.1)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.1)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_5: # %sw.bb2
+; PIC-MIPS64R6-NEXT:  .LBB0_4: # %sw.bb2
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.2)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.2)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_6: # %sw.bb3
+; PIC-MIPS64R6-NEXT:  .LBB0_5: # %sw.bb3
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.3)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.3)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_7: # %sw.bb4
+; PIC-MIPS64R6-NEXT:  .LBB0_6: # %sw.bb4
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.4)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.4)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_8: # %sw.bb5
+; PIC-MIPS64R6-NEXT:  .LBB0_7: # %sw.bb5
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.5)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.5)
 ; PIC-MIPS64R6-NEXT:    b .LBB0_10
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
-; PIC-MIPS64R6-NEXT:  .LBB0_9: # %sw.bb6
+; PIC-MIPS64R6-NEXT:  .LBB0_8: # %sw.bb6
 ; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.6)($2)
 ; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.6)
+; PIC-MIPS64R6-NEXT:    b .LBB0_10
+; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
+; PIC-MIPS64R6-NEXT:  .LBB0_9: # %sw.epilog
+; PIC-MIPS64R6-NEXT:    ld $1, %got_page(.L.str.7)($2)
+; PIC-MIPS64R6-NEXT:    daddiu $1, $1, %got_ofst(.L.str.7)
 ; PIC-MIPS64R6-NEXT:    sd $1, 8($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_10: # %return
 ; PIC-MIPS64R6-NEXT:    ld $2, 8($sp)
diff --git a/llvm/test/CodeGen/Mips/jump-table-mul.ll b/llvm/test/CodeGen/Mips/jump-table-mul.ll
index ef7452cf253fee6..22f41f53d154bf2 100644
--- a/llvm/test/CodeGen/Mips/jump-table-mul.ll
+++ b/llvm/test/CodeGen/Mips/jump-table-mul.ll
@@ -8,15 +8,11 @@ define i64 @test(i64 %arg) {
 ; CHECK-NEXT:    lui $1, %hi(%neg(%gp_rel(test)))
 ; CHECK-NEXT:    daddu $2, $1, $25
 ; CHECK-NEXT:    sltiu $1, $4, 11
-; CHECK-NEXT:    beqz $1, .LBB0_3
+; CHECK-NEXT:    beqz $1, .LBB0_4
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_1: # %entry
 ; CHECK-NEXT:    daddiu $1, $2, %lo(%neg(%gp_rel(test)))
 ; CHECK-NEXT:    dsll $2, $4, 3
-; Previously this dsll was the following sequence:
-;	daddiu	$2, $zero, 8
-;	dmult	$4, $2
-;	mflo	$2
 ; CHECK-NEXT:    ld $3, %got_page(.LJTI0_0)($1)
 ; CHECK-NEXT:    daddu $2, $2, $3
 ; CHECK-NEXT:    ld $2, %got_ofst(.LJTI0_0)($2)
@@ -26,12 +22,16 @@ define i64 @test(i64 %arg) {
 ; CHECK-NEXT:  .LBB0_2: # %sw.bb
 ; CHECK-NEXT:    jr $ra
 ; CHECK-NEXT:    daddiu $2, $zero, 1
-; CHECK-NEXT:  .LBB0_3: # %default
-; CHECK-NEXT:    jr $ra
-; CHECK-NEXT:    daddiu $2, $zero, 1234
-; CHECK-NEXT:  .LBB0_4: # %sw.bb1
+; CHECK-NEXT:  .LBB0_3: # %sw.bb1
 ; CHECK-NEXT:    jr $ra
 ; CHECK-NEXT:    daddiu $2, $zero, 0
+; CHECK-NEXT:  .LBB0_4: # %default
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    daddiu $2, $zero, 1234
+; Previously this dsll was the following sequence:
+;	daddiu	$2, $zero, 8
+;	dmult	$4, $2
+;	mflo	$2
 entry:
   switch i64 %arg, label %default [
     i64 0, label %sw.bb
@@ -54,13 +54,13 @@ sw.bb1:
 ; CHECK-NEXT: 	.p2align	3
 ; CHECK-LABEL: .LJTI0_0:
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
+; CHECK-NEXT: 	.gpdword	.LBB0_4
 ; CHECK-NEXT: 	.gpdword	.LBB0_2
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
-; CHECK-NEXT: 	.gpdword	.LBB0_3
 ; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_4
+; CHECK-NEXT: 	.gpdword	.LBB0_3
diff --git a/llvm/test/CodeGen/Mips/nacl-align.ll b/llvm/test/CodeGen/Mips/nacl-align.ll
index bca6c93de2624d3..964ddfc963c4ec3 100644
--- a/llvm/test/CodeGen/Mips/nacl-align.ll
+++ b/llvm/test/CodeGen/Mips/nacl-align.ll
@@ -44,9 +44,6 @@ default:
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
 ; CHECK-NEXT:        addiu   $2, $zero, 111
-; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
-; CHECK-NEXT:        jr      $ra
-; CHECK-NEXT:        addiu   $2, $zero, 555
 ; CHECK-NEXT:        .p2align  4
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
@@ -55,6 +52,13 @@ default:
 ; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
 ; CHECK-NEXT:        jr      $ra
 ; CHECK-NEXT:        addiu   $2, $zero, 333
+; CHECK-NEXT:        .p2align  4
+; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT:        jr      $ra
+; CHECK-NEXT:        addiu   $2, $zero, 444
+; CHECK-NEXT:    ${{BB[0-9]+_[0-9]+}}:
+; CHECK-NEXT:        jr      $ra
+; CHECK-NEXT:        addiu   $2, $zero, 555
 
 }
 
diff --git a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
index 31f077d57a93355..afb79e55f4f90b8 100644
--- a/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
+++ b/llvm/test/CodeGen/Mips/pseudo-jump-fill.ll
@@ -12,7 +12,7 @@ define i32 @test(i32 signext %x, i32 signext %c) {
 ; CHECK-NEXT:    addiu $2, $2, %lo(_gp_disp)
 ; CHECK-NEXT:    addiur2 $5, $5, -1
 ; CHECK-NEXT:    sltiu $1, $5, 4
-; CHECK-NEXT:    beqz $1, $BB0_3
+; CHECK-NEXT:    beqz $1, $BB0_6
 ; CHECK-NEXT:    addu $3, $2, $25
 ; CHECK-NEXT:  $BB0_1: # %entry
 ; CHECK-NEXT:    li16 $2, 0
@@ -26,17 +26,17 @@ define i32 @test(i32 signext %x, i32 signext %c) {
 ; CHECK-NEXT:  $BB0_2: # %sw.bb2
 ; CHECK-NEXT:    addiur2 $2, $4, 1
 ; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_3:
-; CHECK-NEXT:    move $2, $4
-; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_4: # %sw.bb3
+; CHECK-NEXT:  $BB0_3: # %sw.bb3
 ; CHECK-NEXT:    addius5 $4, 2
 ; CHECK-NEXT:    move $2, $4
 ; CHECK-NEXT:    jrc $ra
-; CHECK-NEXT:  $BB0_5: # %sw.bb5
+; CHECK-NEXT:  $BB0_4: # %sw.bb5
 ; CHECK-NEXT:    addius5 $4, 3
 ; CHECK-NEXT:    move $2, $4
-; CHECK-NEXT:  $BB0_6: # %for.cond.cleanup
+; CHECK-NEXT:  $BB0_5: # %for.cond.cleanup
+; CHECK-NEXT:    jrc $ra
+; CHECK-NEXT:  $BB0_6:
+; CHECK-NEXT:    move $2, $4
 ; CHECK-NEXT:    jrc $ra
 entry:
   switch i32 %c, label %sw.epilog [
diff --git a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
index ccc9adbc2bdd1dd..efb356b7ed817f1 100644
--- a/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
+++ b/llvm/test/CodeGen/PowerPC/jump-tables-collapse-rotate.ll
@@ -11,7 +11,7 @@ define dso_local zeroext i32 @test(i32 signext %l) nounwind {
 ; CHECK-NEXT:    addi r3, r3, -1
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    cmplwi r3, 5
-; CHECK-NEXT:    bgt cr0, .LBB0_3
+; CHECK-NEXT:    bgt cr0, .LBB0_9
 ; CHECK-NEXT:  # %bb.1: # %entry
 ; CHECK-NEXT:    addis r4, r2, .LC0 at toc@ha
 ; CHECK-NEXT:    rldic r3, r3, 2, 30
@@ -24,42 +24,41 @@ define dso_local zeroext i32 @test(i32 signext %l) nounwind {
 ; CHECK-NEXT:    li r3, 2
 ; CHECK-NEXT:    bl test1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_3: # %sw.default
-; CHECK-NEXT:    li r3, 1
-; CHECK-NEXT:    bl test1
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    bl test3
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_4: # %sw.bb3
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_3: # %sw.bb3
 ; CHECK-NEXT:    li r3, 3
-; CHECK-NEXT:    b .LBB0_9
-; CHECK-NEXT:  .LBB0_5: # %sw.bb5
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_4: # %sw.bb5
 ; CHECK-NEXT:    li r3, 4
 ; CHECK-NEXT:    bl test2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    bl test3
-; CHECK-NEXT:    nop
 ; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_6: # %sw.bb8
+; CHECK-NEXT:  .LBB0_5: # %sw.bb8
 ; CHECK-NEXT:    li r3, 5
 ; CHECK-NEXT:    bl test4
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_7: # %sw.bb10
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_6: # %sw.bb10
 ; CHECK-NEXT:    li r3, 66
 ; CHECK-NEXT:    bl test4
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    bl test1
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    b .LBB0_10
-; CHECK-NEXT:  .LBB0_8: # %sw.bb13
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_7: # %sw.bb13
 ; CHECK-NEXT:    li r3, 66
-; CHECK-NEXT:  .LBB0_9: # %return
+; CHECK-NEXT:  .LBB0_8: # %return
 ; CHECK-NEXT:    bl test2
 ; CHECK-NEXT:    nop
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_9: # %sw.default
+; CHECK-NEXT:    li r3, 1
+; CHECK-NEXT:    bl test1
+; CHECK-NEXT:    nop
 ; CHECK-NEXT:  .LBB0_10: # %return
+; CHECK-NEXT:    bl test3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  .LBB0_11: # %return
 ; CHECK-NEXT:    clrldi r3, r3, 32
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
index eeadb73b9db2cff..e97e17fc64782f4 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
@@ -155,11 +155,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_25
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_26: # %bb62
+; CHECK-NEXT:  .LBB0_26: # %bb56
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_26
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_27: # %bb56
+; CHECK-NEXT:  .LBB0_27: # %bb62
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_27
 ; CHECK-NEXT:    .p2align 4
@@ -348,11 +348,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_25
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_26: # %bb62
+; CHECK-BE-NEXT:  .LBB0_26: # %bb56
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_26
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_27: # %bb56
+; CHECK-BE-NEXT:  .LBB0_27: # %bb62
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_27
 ; CHECK-BE-NEXT:    .p2align 4
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
index 32f3342243904e6..4b032781c3764cf 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crlt.ll
@@ -59,10 +59,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    plwz r3, call_1 at PCREL(0), 1
 ; CHECK-NEXT:    cmplwi r3, 0
-; CHECK-NEXT:    bne- cr0, .LBB0_10
+; CHECK-NEXT:    bne- cr0, .LBB0_9
 ; CHECK-NEXT:  # %bb.5: # %bb30
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 4*cr3+eq, .LBB0_9
+; CHECK-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-NEXT:  # %bb.6: # %bb32
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    rlwinm r30, r30, 0, 24, 22
@@ -72,10 +72,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-NEXT:  # %bb.7: # %bb37
 ; CHECK-NEXT:  .LBB0_8: # %bb22
-; CHECK-NEXT:  .LBB0_9: # %bb35
-; CHECK-NEXT:  .LBB0_10: # %bb27
+; CHECK-NEXT:  .LBB0_9: # %bb27
 ; CHECK-NEXT:    bc 4, 4*cr3+lt, .LBB0_12
-; CHECK-NEXT:  # %bb.11: # %bb28
+; CHECK-NEXT:  # %bb.10: # %bb28
+; CHECK-NEXT:  .LBB0_11: # %bb35
 ; CHECK-NEXT:  .LBB0_12: # %bb29
 ; CHECK-NEXT:  .LBB0_13: # %bb3
 ; CHECK-NEXT:  .LBB0_14: # %bb2
@@ -120,10 +120,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    lwz r3, call_1 at toc@l(r30)
 ; CHECK-BE-NEXT:    cmplwi r3, 0
-; CHECK-BE-NEXT:    bne- cr0, .LBB0_10
+; CHECK-BE-NEXT:    bne- cr0, .LBB0_9
 ; CHECK-BE-NEXT:  # %bb.5: # %bb30
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    bc 12, 4*cr3+eq, .LBB0_9
+; CHECK-BE-NEXT:    bc 12, 4*cr3+eq, .LBB0_11
 ; CHECK-BE-NEXT:  # %bb.6: # %bb32
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    rlwinm r29, r29, 0, 24, 22
@@ -134,10 +134,10 @@ define dso_local void @P10_Spill_CR_LT() local_unnamed_addr {
 ; CHECK-BE-NEXT:    beq+ cr2, .LBB0_3
 ; CHECK-BE-NEXT:  # %bb.7: # %bb37
 ; CHECK-BE-NEXT:  .LBB0_8: # %bb22
-; CHECK-BE-NEXT:  .LBB0_9: # %bb35
-; CHECK-BE-NEXT:  .LBB0_10: # %bb27
+; CHECK-BE-NEXT:  .LBB0_9: # %bb27
 ; CHECK-BE-NEXT:    bc 4, 4*cr3+lt, .LBB0_12
-; CHECK-BE-NEXT:  # %bb.11: # %bb28
+; CHECK-BE-NEXT:  # %bb.10: # %bb28
+; CHECK-BE-NEXT:  .LBB0_11: # %bb35
 ; CHECK-BE-NEXT:  .LBB0_12: # %bb29
 ; CHECK-BE-NEXT:  .LBB0_13: # %bb3
 ; CHECK-BE-NEXT:  .LBB0_14: # %bb2
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
index 4ca2dc5dbe04da4..fd049280c9f39f6 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crun.ll
@@ -82,7 +82,7 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    lwz r28, 0(r3)
 ; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-NEXT:  # %bb.4: # %bb37
-; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_14
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_12
 ; CHECK-NEXT:  .LBB0_5: # %bb42
 ; CHECK-NEXT:    paddi r3, 0, global_1 at PCREL, 1
 ; CHECK-NEXT:    li r4, 0
@@ -95,10 +95,10 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    lxsihzx v2, 0, r3
 ; CHECK-NEXT:    vextsh2d v2, v2
 ; CHECK-NEXT:    xscvsxdsp f0, v2
-; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB0_12
+; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB0_13
 ; CHECK-NEXT:  # %bb.6: # %bb42
 ; CHECK-NEXT:    xxspltidp vs1, 1069547520
-; CHECK-NEXT:    b .LBB0_13
+; CHECK-NEXT:    b .LBB0_14
 ; CHECK-NEXT:  .LBB0_7: # %bb19
 ; CHECK-NEXT:    setnbc r3, 4*cr2+un
 ; CHECK-NEXT:    paddi r4, 0, global_4 at PCREL, 1
@@ -134,15 +134,15 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-NEXT:    rlwimi r3, r4, 21, 11, 11
 ; CHECK-NEXT:    mtocrf 32, r3
 ; CHECK-NEXT:    b .LBB0_16
-; CHECK-NEXT:  .LBB0_12:
+; CHECK-NEXT:  .LBB0_12: # %bb41
+; CHECK-NEXT:    # implicit-def: $r3
+; CHECK-NEXT:    b .LBB0_15
+; CHECK-NEXT:  .LBB0_13:
 ; CHECK-NEXT:    xxspltidp vs1, 1071644672
-; CHECK-NEXT:  .LBB0_13: # %bb42
+; CHECK-NEXT:  .LBB0_14: # %bb42
 ; CHECK-NEXT:    xsmulsp f0, f1, f0
 ; CHECK-NEXT:    xscvdpsxws f0, f0
 ; CHECK-NEXT:    mffprwz r3, f0
-; CHECK-NEXT:    b .LBB0_15
-; CHECK-NEXT:  .LBB0_14: # %bb41
-; CHECK-NEXT:    # implicit-def: $r3
 ; CHECK-NEXT:  .LBB0_15: # %bb50
 ; CHECK-NEXT:    li r4, 0
 ; CHECK-NEXT:    xxspltidp vs3, -1082130432
@@ -232,7 +232,7 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    addis r3, r2, global_1 at toc@ha
 ; CHECK-BE-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-BE-NEXT:  # %bb.4: # %bb37
-; CHECK-BE-NEXT:    bc 4, 4*cr5+lt, .LBB0_14
+; CHECK-BE-NEXT:    bc 4, 4*cr5+lt, .LBB0_12
 ; CHECK-BE-NEXT:  .LBB0_5: # %bb42
 ; CHECK-BE-NEXT:    addi r3, r3, global_1 at toc@l
 ; CHECK-BE-NEXT:    li r4, 0
@@ -247,10 +247,10 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    lxsihzx v2, 0, r3
 ; CHECK-BE-NEXT:    vextsh2d v2, v2
 ; CHECK-BE-NEXT:    xscvsxdsp f0, v2
-; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB0_12
+; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB0_13
 ; CHECK-BE-NEXT:  # %bb.6: # %bb42
 ; CHECK-BE-NEXT:    xxspltidp vs1, 1069547520
-; CHECK-BE-NEXT:    b .LBB0_13
+; CHECK-BE-NEXT:    b .LBB0_14
 ; CHECK-BE-NEXT:  .LBB0_7: # %bb19
 ; CHECK-BE-NEXT:    setnbc r3, 4*cr2+un
 ; CHECK-BE-NEXT:    addis r4, r2, global_4 at toc@ha
@@ -292,15 +292,15 @@ define dso_local void @P10_Spill_CR_UN(ptr %arg, ptr %arg1, i32 %arg2) local_unn
 ; CHECK-BE-NEXT:    rlwimi r3, r4, 21, 11, 11
 ; CHECK-BE-NEXT:    mtocrf 32, r3
 ; CHECK-BE-NEXT:    b .LBB0_16
-; CHECK-BE-NEXT:  .LBB0_12:
+; CHECK-BE-NEXT:  .LBB0_12: # %bb41
+; CHECK-BE-NEXT:    # implicit-def: $r3
+; CHECK-BE-NEXT:    b .LBB0_15
+; CHECK-BE-NEXT:  .LBB0_13:
 ; CHECK-BE-NEXT:    xxspltidp vs1, 1071644672
-; CHECK-BE-NEXT:  .LBB0_13: # %bb42
+; CHECK-BE-NEXT:  .LBB0_14: # %bb42
 ; CHECK-BE-NEXT:    xsmulsp f0, f1, f0
 ; CHECK-BE-NEXT:    xscvdpsxws f0, f0
 ; CHECK-BE-NEXT:    mffprwz r3, f0
-; CHECK-BE-NEXT:    b .LBB0_15
-; CHECK-BE-NEXT:  .LBB0_14: # %bb41
-; CHECK-BE-NEXT:    # implicit-def: $r3
 ; CHECK-BE-NEXT:  .LBB0_15: # %bb50
 ; CHECK-BE-NEXT:    li r4, 0
 ; CHECK-BE-NEXT:    xxspltidp vs3, -1082130432
diff --git a/llvm/test/CodeGen/PowerPC/pr43527.ll b/llvm/test/CodeGen/PowerPC/pr43527.ll
index 379bd6c070c777f..d7c70ab7f066c80 100644
--- a/llvm/test/CodeGen/PowerPC/pr43527.ll
+++ b/llvm/test/CodeGen/PowerPC/pr43527.ll
@@ -5,9 +5,9 @@
 define dso_local void @test(i64 %arg, i64 %arg1) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0: # %bb
-; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_5
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_6
 ; CHECK-NEXT:  # %bb.1: # %bb3
-; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_6
+; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
 ; CHECK-NEXT:  # %bb.2: # %bb4
 ; CHECK-NEXT:    mflr r0
 ; CHECK-NEXT:    .cfi_def_cfa_offset 64
@@ -37,8 +37,8 @@ define dso_local void @test(i64 %arg, i64 %arg1) {
 ; CHECK-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
-; CHECK-NEXT:  .LBB0_5: # %bb2
-; CHECK-NEXT:  .LBB0_6: # %bb14
+; CHECK-NEXT:  .LBB0_5: # %bb14
+; CHECK-NEXT:  .LBB0_6: # %bb2
 bb:
   br i1 undef, label %bb3, label %bb2
 
diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll
index 0f8014df8adca93..6b3d578f6b33829 100644
--- a/llvm/test/CodeGen/PowerPC/pr45448.ll
+++ b/llvm/test/CodeGen/PowerPC/pr45448.ll
@@ -7,17 +7,17 @@ define hidden void @julia_tryparse_internal_45896() #0 {
 ; CHECK:       # %bb.0: # %top
 ; CHECK-NEXT:    ld r3, 0(r3)
 ; CHECK-NEXT:    cmpldi r3, 0
-; CHECK-NEXT:    beq cr0, .LBB0_3
+; CHECK-NEXT:    beq cr0, .LBB0_6
 ; CHECK-NEXT:  # %bb.1: # %top
 ; CHECK-NEXT:    cmpldi r3, 10
-; CHECK-NEXT:    beq cr0, .LBB0_4
+; CHECK-NEXT:    beq cr0, .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %top
-; CHECK-NEXT:  .LBB0_3: # %fail194
-; CHECK-NEXT:  .LBB0_4: # %L294
-; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_6
-; CHECK-NEXT:  # %bb.5: # %L294
+; CHECK-NEXT:  .LBB0_3: # %L294
+; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %L294
 ; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_7
-; CHECK-NEXT:  .LBB0_6: # %L1057.preheader
+; CHECK-NEXT:  .LBB0_5: # %L1057.preheader
+; CHECK-NEXT:  .LBB0_6: # %fail194
 ; CHECK-NEXT:  .LBB0_7: # %L670
 ; CHECK-NEXT:    li r5, -3
 ; CHECK-NEXT:    cmpdi r3, 0
diff --git a/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll b/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
index 99780c5e0d444b6..75af75ce75ed9c0 100644
--- a/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
+++ b/llvm/test/CodeGen/RISCV/shrinkwrap-jump-table.ll
@@ -14,7 +14,7 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    lw a1, 0(a0)
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    li a2, 4
-; CHECK-NEXT:    bltu a2, a1, .LBB0_3
+; CHECK-NEXT:    bltu a2, a1, .LBB0_7
 ; CHECK-NEXT:  # %bb.1: # %entry
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    lui a2, %hi(.LJTI0_0)
@@ -24,7 +24,15 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    jr a1
 ; CHECK-NEXT:  .LBB0_2: # %sw.bb
 ; CHECK-NEXT:    tail func1 at plt
-; CHECK-NEXT:  .LBB0_3: # %sw.default
+; CHECK-NEXT:  .LBB0_3: # %sw.bb1
+; CHECK-NEXT:    tail func2 at plt
+; CHECK-NEXT:  .LBB0_4: # %sw.bb3
+; CHECK-NEXT:    tail func3 at plt
+; CHECK-NEXT:  .LBB0_5: # %sw.bb5
+; CHECK-NEXT:    tail func4 at plt
+; CHECK-NEXT:  .LBB0_6: # %sw.bb7
+; CHECK-NEXT:    tail func5 at plt
+; CHECK-NEXT:  .LBB0_7: # %sw.default
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
@@ -34,14 +42,6 @@ define dso_local signext i32 @test_shrinkwrap_jump_table(ptr noundef %m) local_u
 ; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB0_4: # %sw.bb1
-; CHECK-NEXT:    tail func2 at plt
-; CHECK-NEXT:  .LBB0_5: # %sw.bb3
-; CHECK-NEXT:    tail func3 at plt
-; CHECK-NEXT:  .LBB0_6: # %sw.bb5
-; CHECK-NEXT:    tail func4 at plt
-; CHECK-NEXT:  .LBB0_7: # %sw.bb7
-; CHECK-NEXT:    tail func5 at plt
 entry:
   %0 = load i32, ptr %m, align 4
   switch i32 %0, label %sw.default [
diff --git a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
index 59e346588754a40..ed895be96cd8910 100644
--- a/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
+++ b/llvm/test/CodeGen/Thumb2/bti-indirect-branches.ll
@@ -7,30 +7,30 @@ define internal i32 @table_switch(i32 %x) {
 ; CHECK-NEXT:    bti
 ; CHECK-NEXT:    subs r1, r0, #1
 ; CHECK-NEXT:    cmp r1, #3
-; CHECK-NEXT:    bhi .LBB0_4
+; CHECK-NEXT:    bhi .LBB0_6
 ; CHECK-NEXT:  @ %bb.1: @ %entry
 ; CHECK-NEXT:  .LCPI0_0:
 ; CHECK-NEXT:    tbb [pc, r1]
 ; CHECK-NEXT:  @ %bb.2:
 ; CHECK-NEXT:  .LJTI0_0:
-; CHECK-NEXT:    .byte (.LBB0_5-(.LCPI0_0+4))/2
-; CHECK-NEXT:    .byte (.LBB0_3-(.LCPI0_0+4))/2
-; CHECK-NEXT:    .byte (.LBB0_6-(.LCPI0_0+4))/2
 ; CHECK-NEXT:    .byte (.LBB0_7-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_3-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_4-(.LCPI0_0+4))/2
+; CHECK-NEXT:    .byte (.LBB0_5-(.LCPI0_0+4))/2
 ; CHECK-NEXT:    .p2align 1
 ; CHECK-NEXT:  .LBB0_3: @ %bb2
 ; CHECK-NEXT:    movs r0, #2
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_4: @ %sw.epilog
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:  .LBB0_5: @ %return
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_6: @ %bb3
+; CHECK-NEXT:  .LBB0_4: @ %bb3
 ; CHECK-NEXT:    movs r0, #3
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:  .LBB0_7: @ %bb4
+; CHECK-NEXT:  .LBB0_5: @ %bb4
 ; CHECK-NEXT:    movs r0, #4
 ; CHECK-NEXT:    bx lr
+; CHECK-NEXT:  .LBB0_6: @ %sw.epilog
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:  .LBB0_7: @ %return
+; CHECK-NEXT:    bx lr
 entry:
   switch i32 %x, label %sw.epilog [
     i32 1, label %bb1
@@ -93,23 +93,6 @@ declare void @consume_exception(ptr)
 declare i32 @__gxx_personality_v0(...)
 
 define internal i32 @exception_handling(i32 %0) personality ptr @__gxx_personality_v0 {
-; CHECK-LABEL: exception_handling:
-; CHECK:       @ %bb.0:
-; CHECK-NEXT:    bti
-; CHECK-NEXT:    .save  {r7, lr}
-; CHECK-NEXT:    push   {r7, lr}
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    bl     may_throw
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:    movs   r0, #0
-; CHECK-NEXT:    pop    {r7, pc}
-; CHECK-NEXT:  .LBB2_2:
-; CHECK-NEXT:  .Ltmp2:
-; CHECK-NEXT:    bti
-; CHECK-NEXT:    bl     consume_exception
-; CHECK-NEXT:    movs   r0, #1
-; CHECK-NEXT:    pop    {r7, pc}
 entry:
   invoke void @may_throw()
           to label %return unwind label %lpad
diff --git a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
index 98fe30039259f03..1aeecdf1e08f36e 100644
--- a/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
+++ b/llvm/test/CodeGen/Thumb2/constant-hoisting.ll
@@ -7,27 +7,27 @@ define i32 @test_values(i32 %a, i32 %b) minsize optsize {
 ; CHECK-V6M:         mov r2, r0
 ; CHECK-V6M-NEXT:    ldr r0, .LCPI0_0
 ; CHECK-V6M-NEXT:    cmp r2, #50
-; CHECK-V6M-NEXT:    beq .LBB0_5
-; CHECK-V6M-NEXT:    cmp r2, #1
 ; CHECK-V6M-NEXT:    beq .LBB0_7
+; CHECK-V6M-NEXT:    cmp r2, #1
+; CHECK-V6M-NEXT:    beq .LBB0_5
 ; CHECK-V6M-NEXT:    cmp r2, #30
-; CHECK-V6M-NEXT:    beq .LBB0_8
+; CHECK-V6M-NEXT:    beq .LBB0_6
 ; CHECK-V6M-NEXT:    cmp r2, #0
-; CHECK-V6M-NEXT:    bne .LBB0_6
+; CHECK-V6M-NEXT:    bne .LBB0_8
 ; CHECK-V6M-NEXT:    adds r0, r1, r0
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_5:
 ; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #4
+; CHECK-V6M-NEXT:    adds r0, r0, #1
+; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_6:
+; CHECK-V6M-NEXT:    adds r0, r0, r1
+; CHECK-V6M-NEXT:    adds r0, r0, #2
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:  .LBB0_7:
 ; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #1
-; CHECK-V6M-NEXT:    bx lr
+; CHECK-V6M-NEXT:    adds r0, r0, #4
 ; CHECK-V6M-NEXT:  .LBB0_8:
-; CHECK-V6M-NEXT:    adds r0, r0, r1
-; CHECK-V6M-NEXT:    adds r0, r0, #2
 ; CHECK-V6M-NEXT:    bx lr
 ; CHECK-V6M-NEXT:    .p2align 2
 ; CHECK-V6M-NEXT:  .LCPI0_0:
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index 216d4cca097001c..fd44a54a7273801 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -21,15 +21,15 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    or %s0, 3, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    or %s0, 7, (0)1
 ; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    or %s0, 3, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; PIC-LABEL: br_jt3:
 ; PIC:       # %bb.0:
@@ -43,14 +43,14 @@ define signext i32 @br_jt3(i32 signext %0) {
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB0_1:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB0_5:
 ; PIC-NEXT:    or %s0, 7, (0)1
 ; PIC-NEXT:  .LBB0_6:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB0_1:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %4 [
     i32 1, label %5
@@ -308,16 +308,16 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    or %s0, 0, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB4_1:
-; CHECK-NEXT:    or %s0, 3, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB4_5:
 ; CHECK-NEXT:    and %s0, %s1, (32)0
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s0
 ; CHECK-NEXT:  .LBB4_6:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB4_1:
+; CHECK-NEXT:    or %s0, 3, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ;
 ; PIC-LABEL: br_jt3_m:
 ; PIC:       # %bb.0:
@@ -331,15 +331,15 @@ define signext i32 @br_jt3_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB4_1:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB4_5:
 ; PIC-NEXT:    and %s0, %s1, (32)0
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s0
 ; PIC-NEXT:  .LBB4_6:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB4_1:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %6 [
     i32 1, label %7
@@ -534,15 +534,15 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB6_2:
-; PIC-NEXT:    or %s0, 3, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_15:
 ; PIC-NEXT:    or %s0, 11, (0)1
 ; PIC-NEXT:  .LBB6_16:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB6_2:
+; PIC-NEXT:    or %s0, 3, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_13:
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -615,11 +615,6 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB7_9:
-; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB7_10:
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB7_5:
 ; CHECK-NEXT:    adds.w.sx %s0, -5, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -628,12 +623,17 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB7_7:
+; CHECK-NEXT:    or %s0, 10, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB7_8:
 ; CHECK-NEXT:    or %s0, 11, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB7_7:
-; CHECK-NEXT:    or %s0, 10, (0)1
+; CHECK-NEXT:  .LBB7_9:
+; CHECK-NEXT:    or %s0, 0, %s2
+; CHECK-NEXT:  .LBB7_10:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -672,20 +672,20 @@ define signext i32 @br_jt8_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:  .LBB7_4:
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
-; PIC-NEXT:  .LBB7_9:
-; PIC-NEXT:    or %s0, 0, %s2
-; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_5:
 ; PIC-NEXT:    adds.w.sx %s0, -5, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_6:
 ; PIC-NEXT:    adds.w.sx %s0, -2, %s1
 ; PIC-NEXT:    br.l.t .LBB7_10
+; PIC-NEXT:  .LBB7_7:
+; PIC-NEXT:    or %s0, 10, (0)1
+; PIC-NEXT:    br.l.t .LBB7_10
 ; PIC-NEXT:  .LBB7_8:
 ; PIC-NEXT:    or %s0, 11, (0)1
 ; PIC-NEXT:    br.l.t .LBB7_10
-; PIC-NEXT:  .LBB7_7:
-; PIC-NEXT:    or %s0, 10, (0)1
+; PIC-NEXT:  .LBB7_9:
+; PIC-NEXT:    or %s0, 0, %s2
 ; PIC-NEXT:  .LBB7_10:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    ld %s16, 32(, %s11)
diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
index bf939c4131080d3..332a4596eb337e0 100644
--- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
+++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll
@@ -137,15 +137,6 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    calll __ZN12wxStringBase10ConcatSelfEmPKwm
 ; CHECK-NEXT:  Ltmp11:
 ; CHECK-NEXT:    jmp LBB0_5
-; CHECK-NEXT:  LBB0_9: ## %bb5657
-; CHECK-NEXT:  Ltmp13:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl %eax, (%esp)
-; CHECK-NEXT:    calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
-; CHECK-NEXT:  Ltmp14:
-; CHECK-NEXT:    jmp LBB0_25
 ; CHECK-NEXT:  LBB0_20: ## %bb5968
 ; CHECK-NEXT:  Ltmp2:
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -154,6 +145,15 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(ptr noalias sret(%struct
 ; CHECK-NEXT:    calll __ZN8wxString6FormatEPKwz
 ; CHECK-NEXT:    subl $4, %esp
 ; CHECK-NEXT:  Ltmp3:
+; CHECK-NEXT:    jmp LBB0_25
+; CHECK-NEXT:  LBB0_9: ## %bb5657
+; CHECK-NEXT:  Ltmp13:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE
+; CHECK-NEXT:  Ltmp14:
 ; CHECK-NEXT:  LBB0_25: ## %bb115.critedge.i
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    addl $28, %esp
diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
index 214da14322d511e..5ad218f6cdc8934 100644
--- a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
+++ b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
@@ -45,9 +45,6 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:  LBB0_3: ## %RRETURN_6
 ; CHECK-NEXT:    callq _f2
 ; CHECK-NEXT:    jmp LBB0_28
-; CHECK-NEXT:  LBB0_2: ## %RETURN
-; CHECK-NEXT:    callq _f1
-; CHECK-NEXT:    jmp LBB0_28
 ; CHECK-NEXT:  LBB0_4: ## %RRETURN_7
 ; CHECK-NEXT:    callq _f3
 ; CHECK-NEXT:    jmp LBB0_28
@@ -119,6 +116,9 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:    jmp LBB0_28
 ; CHECK-NEXT:  LBB0_27: ## %RRETURN_1
 ; CHECK-NEXT:    callq _f26
+; CHECK-NEXT:    jmp LBB0_28
+; CHECK-NEXT:  LBB0_2: ## %RETURN
+; CHECK-NEXT:    callq _f1
 ; CHECK-NEXT:  LBB0_28: ## %EXIT
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    popq %rcx
diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll
index a522f0e9828a054..5cdabc0a79b0d8e 100644
--- a/llvm/test/CodeGen/X86/block-placement.ll
+++ b/llvm/test/CodeGen/X86/block-placement.ll
@@ -1149,8 +1149,8 @@ define void @test_flow_unwind() personality i32 (...)* @pers {
 ; CHECK: %entry
 ; CHECK: %then
 ; CHECK: %exit
-; CHECK: %innerlp
 ; CHECK: %outerlp
+; CHECK: %innerlp
 ; CHECK: %outercleanup
 entry:
   %0 = invoke i32 @foo()
diff --git a/llvm/test/CodeGen/X86/callbr-asm-outputs.ll b/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
index f5f033398310116..880f85cc7f17b78 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-outputs.ll
@@ -50,12 +50,12 @@ define i32 @test2(i32 %out1, i32 %out2) nounwind {
 ; CHECK-NEXT:  .LBB1_2: # Block address taken
 ; CHECK-NEXT:    # %if.then.label_true_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:    jmp .LBB1_8
+; CHECK-NEXT:    jmp .LBB1_9
 ; CHECK-NEXT:  .LBB1_3: # %if.else
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    testl %esi, %edi
 ; CHECK-NEXT:    testl %esi, %edi
-; CHECK-NEXT:    jne .LBB1_9
+; CHECK-NEXT:    jne .LBB1_6
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    movl %esi, %eax
@@ -64,20 +64,20 @@ define i32 @test2(i32 %out1, i32 %out2) nounwind {
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    popl %edi
 ; CHECK-NEXT:    retl
-; CHECK-NEXT:  .LBB1_7: # Block address taken
-; CHECK-NEXT:    # %if.else.label_true_crit_edge
-; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:  .LBB1_8: # %label_true
-; CHECK-NEXT:    movl $-2, %eax
-; CHECK-NEXT:    jmp .LBB1_5
-; CHECK-NEXT:  .LBB1_9: # Block address taken
+; CHECK-NEXT:  .LBB1_6: # Block address taken
 ; CHECK-NEXT:    # %if.else.return_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
-; CHECK-NEXT:  .LBB1_6: # Block address taken
+; CHECK-NEXT:  .LBB1_7: # Block address taken
 ; CHECK-NEXT:    # %if.then.return_crit_edge
 ; CHECK-NEXT:    # Label of block must be emitted
 ; CHECK-NEXT:    movl $-1, %eax
 ; CHECK-NEXT:    jmp .LBB1_5
+; CHECK-NEXT:  .LBB1_8: # Block address taken
+; CHECK-NEXT:    # %if.else.label_true_crit_edge
+; CHECK-NEXT:    # Label of block must be emitted
+; CHECK-NEXT:  .LBB1_9: # %label_true
+; CHECK-NEXT:    movl $-2, %eax
+; CHECK-NEXT:    jmp .LBB1_5
 entry:
   %cmp = icmp slt i32 %out1, %out2
   br i1 %cmp, label %if.then, label %if.else
diff --git a/llvm/test/CodeGen/X86/dup-cost.ll b/llvm/test/CodeGen/X86/dup-cost.ll
index ec9d36aa2a11b65..93af487c64ca09d 100644
--- a/llvm/test/CodeGen/X86/dup-cost.ll
+++ b/llvm/test/CodeGen/X86/dup-cost.ll
@@ -16,13 +16,13 @@ define i32 @cold(i32 %a, ptr %p, ptr %q) !prof !21 {
 ; CHECK-NEXT:  # %bb.4: # %true2
 ; CHECK-NEXT:    xorl %edi, %eax
 ; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_5: # %false2
+; CHECK-NEXT:    andl %edi, %eax
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_2: # %false1
 ; CHECK-NEXT:    movl (%rdx), %eax
 ; CHECK-NEXT:    addl $-3, %eax
 ; CHECK-NEXT:    jmp .LBB0_3
-; CHECK-NEXT:  .LBB0_5: # %false2
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    retq
 entry:
   %cond1 = icmp sgt i32 %a, 1
   br i1 %cond1, label %true1, label %false1, !prof !30
diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index beb2dba05e85ac3..9b74ef08a8f5482 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -28,7 +28,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_4:
 ; X86-NEXT:    decl %ecx
 ; X86-NEXT:    cmpl $31, %ecx
-; X86-NEXT:    ja .LBB0_7
+; X86-NEXT:    ja .LBB0_12
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    jmpl *.LJTI0_0(,%ecx,4)
 ; X86-NEXT:  .LBB0_6:
@@ -36,27 +36,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_7:
-; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB0_8:
-; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 4
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_10:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $2, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB0_12:
+; X86-NEXT:  .LBB0_10:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_9
-; X86-NEXT:  .LBB0_13:
+; X86-NEXT:    jmp .LBB0_7
+; X86-NEXT:  .LBB0_11:
 ; X86-NEXT:    leal (,%eax,8), %ecx
 ; X86-NEXT:    jmp .LBB0_42
+; X86-NEXT:  .LBB0_12:
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB0_13:
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_14:
+; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -64,13 +64,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_16:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:    jmp .LBB0_9
 ; X86-NEXT:  .LBB0_17:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:    jmp .LBB0_18
 ; X86-NEXT:  .LBB0_19:
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_20:
 ; X86-NEXT:    leal (%eax,%eax,2), %ecx
 ; X86-NEXT:    jmp .LBB0_21
@@ -80,7 +80,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_23
 ; X86-NEXT:  .LBB0_24:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_25:
 ; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    popl %esi
@@ -109,7 +109,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:  .LBB0_30:
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    shll $2, %eax
-; X86-NEXT:    jmp .LBB0_11
+; X86-NEXT:    jmp .LBB0_9
 ; X86-NEXT:  .LBB0_31:
 ; X86-NEXT:    leal (%eax,%eax,4), %ecx
 ; X86-NEXT:  .LBB0_21:
@@ -128,10 +128,10 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_42
 ; X86-NEXT:  .LBB0_34:
 ; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    jmp .LBB0_9
+; X86-NEXT:    jmp .LBB0_7
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
-; X86-NEXT:  .LBB0_11:
+; X86-NEXT:  .LBB0_9:
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -143,7 +143,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X86-NEXT:    jmp .LBB0_27
 ; X86-NEXT:  .LBB0_37:
 ; X86-NEXT:    leal (%eax,%eax,8), %eax
-; X86-NEXT:  .LBB0_9:
+; X86-NEXT:  .LBB0_7:
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 4
@@ -199,41 +199,54 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:    cmovel %ecx, %eax
 ; X64-HSW-NEXT:    decl %edi
 ; X64-HSW-NEXT:    cmpl $31, %edi
-; X64-HSW-NEXT:    ja .LBB0_3
+; X64-HSW-NEXT:    ja .LBB0_8
 ; X64-HSW-NEXT:  # %bb.1:
 ; X64-HSW-NEXT:    jmpq *.LJTI0_0(,%rdi,8)
 ; X64-HSW-NEXT:  .LBB0_2:
 ; X64-HSW-NEXT:    addl %eax, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_3:
-; X64-HSW-NEXT:    xorl %eax, %eax
 ; X64-HSW-NEXT:  .LBB0_4:
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_6:
 ; X64-HSW-NEXT:    shll $2, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_8:
+; X64-HSW-NEXT:  .LBB0_6:
 ; X64-HSW-NEXT:    addl %eax, %eax
-; X64-HSW-NEXT:  .LBB0_5:
+; X64-HSW-NEXT:  .LBB0_3:
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_33:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_34
+; X64-HSW-NEXT:  .LBB0_8:
+; X64-HSW-NEXT:    xorl %eax, %eax
 ; X64-HSW-NEXT:  .LBB0_9:
-; X64-HSW-NEXT:    leal (,%rax,8), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_38
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_10:
 ; X64-HSW-NEXT:    shll $3, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_12:
 ; X64-HSW-NEXT:    addl %eax, %eax
-; X64-HSW-NEXT:  .LBB0_7:
+; X64-HSW-NEXT:  .LBB0_5:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_31:
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,4), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_34
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_7:
+; X64-HSW-NEXT:    leal (,%rax,8), %ecx
+; X64-HSW-NEXT:    jmp .LBB0_38
 ; X64-HSW-NEXT:  .LBB0_13:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
 ; X64-HSW-NEXT:    leal (%rax,%rcx,2), %eax
@@ -292,33 +305,6 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_27:
 ; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
 ; X64-HSW-NEXT:    leal (%rax,%rcx,4), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_34
-; X64-HSW-NEXT:  .LBB0_28:
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %ecx
-; X64-HSW-NEXT:    shll $3, %ecx
-; X64-HSW-NEXT:    jmp .LBB0_38
-; X64-HSW-NEXT:  .LBB0_29:
-; X64-HSW-NEXT:    shll $3, %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_30:
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_31:
-; X64-HSW-NEXT:    leal (%rax,%rax,4), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,4), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_34
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %eax
-; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
-; X64-HSW-NEXT:    retq
-; X64-HSW-NEXT:  .LBB0_33:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
 ; X64-HSW-NEXT:  .LBB0_34:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
@@ -340,6 +326,10 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_37:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
+; X64-HSW-NEXT:    jmp .LBB0_38
+; X64-HSW-NEXT:  .LBB0_28:
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %ecx
+; X64-HSW-NEXT:    shll $3, %ecx
 ; X64-HSW-NEXT:  .LBB0_38:
 ; X64-HSW-NEXT:    subl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
@@ -348,6 +338,16 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
 ; X64-HSW-NEXT:  .LBB0_40:
 ; X64-HSW-NEXT:    shll $5, %eax
 ; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_29:
+; X64-HSW-NEXT:    shll $3, %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_30:
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax
+; X64-HSW-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-HSW-NEXT:    retq
   %3 = icmp eq i32 %1, 0
   %4 = icmp sgt i32 %1, 1
diff --git a/llvm/test/CodeGen/X86/pic.ll b/llvm/test/CodeGen/X86/pic.ll
index 7c4db752b4e0425..408ea9ad71bd9e6 100644
--- a/llvm/test/CodeGen/X86/pic.ll
+++ b/llvm/test/CodeGen/X86/pic.ll
@@ -231,19 +231,19 @@ bb12:
 ; CHECK-I686:	.long	 .LBB7_5 at GOTOFF
 ; CHECK-I686:	.long	 .LBB7_8 at GOTOFF
 ; CHECK-I686:	.long	 .LBB7_7 at GOTOFF
+; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_2-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
 ; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_6-.LJTI7_0
 ; CHECK-X32:	.long	.LBB7_3-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_10-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_8-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_10-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_9-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_12-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_14-.LJTI7_0
-; CHECK-X32:	.long	.LBB7_14-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_6-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_5-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_4-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_7-.LJTI7_0
+; CHECK-X32:	.long	.LBB7_7-.LJTI7_0
 }
 
 declare void @foo1(...)
diff --git a/llvm/test/CodeGen/X86/pr38743.ll b/llvm/test/CodeGen/X86/pr38743.ll
index c05310090660dd0..ef1c87118c8118f 100644
--- a/llvm/test/CodeGen/X86/pr38743.ll
+++ b/llvm/test/CodeGen/X86/pr38743.ll
@@ -27,15 +27,15 @@ define void @pr38743(i32 %a0) #1 align 2 {
 ; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq .str.17(%rip), %rax
 ; CHECK-NEXT:    jmp .LBB0_4
-; CHECK-NEXT:  .LBB0_1: # %bb2
-; CHECK-NEXT:    movq .str.16+7(%rip), %rax
-; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq .str.16(%rip), %rax
-; CHECK-NEXT:    jmp .LBB0_4
 ; CHECK-NEXT:  .LBB0_3: # %bb8
 ; CHECK-NEXT:    movq .str.18+6(%rip), %rax
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq .str.18(%rip), %rax
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_1: # %bb2
+; CHECK-NEXT:    movq .str.16+7(%rip), %rax
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq .str.16(%rip), %rax
 ; CHECK-NEXT:  .LBB0_4: # %bb12
 ; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index e9448a800fd9597..c50a39c99fb6c65 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -190,14 +190,14 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:    movl $268, %ebp ## imm = 0x10C
 ; CHECK-NEXT:    jmp LBB0_20
-; CHECK-NEXT:  LBB0_39: ## %sw.bb566
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $20, %ebp
-; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_19: ## %sw.bb243
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    movl $2, %ebp
 ; CHECK-NEXT:    jmp LBB0_20
+; CHECK-NEXT:  LBB0_39: ## %sw.bb566
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movl $20, %ebp
+; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_32: ## %if.end517.loopexitsplit
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    incq %rbx
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 4d0599022d53847..7c8618af83d3d7b 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -534,12 +534,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB6_2: # %bb0
-; X64-NEXT:    cmovbeq %rax, %rcx
-; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $2, %eax
-; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    retq
 ; X64-NEXT:  .LBB6_4: # Block address taken
 ; X64-NEXT:    # %bb2
 ; X64-NEXT:    cmpq $.LBB6_4, %rdx
@@ -564,6 +558,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-NEXT:    movl $11, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
+; X64-NEXT:  .LBB6_2: # %bb0
+; X64-NEXT:    cmovbeq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ;
 ; X64-PIC-LABEL: test_switch_jumptable:
 ; X64-PIC:       # %bb.0: # %entry
@@ -589,12 +589,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
-; X64-PIC-NEXT:  .LBB6_2: # %bb0
-; X64-PIC-NEXT:    cmovbeq %rax, %rcx
-; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $2, %eax
-; X64-PIC-NEXT:    orq %rcx, %rsp
-; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .LBB6_4: # Block address taken
 ; X64-PIC-NEXT:    # %bb2
 ; X64-PIC-NEXT:    leaq .LBB6_4(%rip), %rsi
@@ -622,6 +616,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    movl $11, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB6_2: # %bb0
+; X64-PIC-NEXT:    cmovbeq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $2, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_switch_jumptable:
 ; X64-RETPOLINE:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index f5040f2b2bab557..264c5bf15aa28bb 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -284,8 +284,6 @@ define void @jt_is_better(i32 %x) {
 ; CHECK-NEXT:  .LBB4_3: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB4_7: # %return
-; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB4_4: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -295,6 +293,8 @@ define void @jt_is_better(i32 %x) {
 ; CHECK-NEXT:  .LBB4_6: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB4_7: # %return
+; CHECK-NEXT:    retq
 ;
 ; NOOPT-LABEL: jt_is_better:
 ; NOOPT:       # %bb.0: # %entry
@@ -1880,13 +1880,6 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.7: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_12: # %entry
-; CHECK-NEXT:    cmpl $50, %edi
-; CHECK-NEXT:    je .LBB19_17
-; CHECK-NEXT:  # %bb.13: # %entry
-; CHECK-NEXT:    cmpl $60, %edi
-; CHECK-NEXT:    je .LBB19_14
-; CHECK-NEXT:    jmp .LBB19_18
 ; CHECK-NEXT:  .LBB19_8: # %entry
 ; CHECK-NEXT:    cmpl $30, %edi
 ; CHECK-NEXT:    je .LBB19_16
@@ -1896,6 +1889,14 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.10: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_12: # %entry
+; CHECK-NEXT:    cmpl $50, %edi
+; CHECK-NEXT:    je .LBB19_17
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    cmpl $60, %edi
+; CHECK-NEXT:    je .LBB19_14
+; CHECK-NEXT:  .LBB19_18: # %return
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_15: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -1905,8 +1906,6 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  .LBB19_17: # %bb5
 ; CHECK-NEXT:    movl $5, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_18: # %return
-; CHECK-NEXT:    retq
 ;
 ; NOOPT-LABEL: left_leaning_weight_balanced_tree:
 ; NOOPT:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll
index 59612bfe9a535ea..d2067dd4e51c24a 100644
--- a/llvm/test/CodeGen/X86/win-catchpad.ll
+++ b/llvm/test/CodeGen/X86/win-catchpad.ll
@@ -64,13 +64,13 @@ try.cont:
 ; X86: retl
 
 ; FIXME: These should be de-duplicated.
-; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT:                        # %handler2
+; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT:                        # %handler1
 ; X86-NEXT: addl $12, %ebp
 ; X86: jmp [[contbb]]
 
-; X86: [[restorebb1:LBB0_[0-9]+]]: # Block address taken
-; X86-NEXT:                        # %handler1
+; X86: [[restorebb2:LBB0_[0-9]+]]: # Block address taken
+; X86-NEXT:                        # %handler2
 ; X86-NEXT: addl $12, %ebp
 ; X86: jmp [[contbb]]
 
diff --git a/llvm/unittests/Support/BlockFrequencyTest.cpp b/llvm/unittests/Support/BlockFrequencyTest.cpp
index d0374b0a8c51944..18bb407e7fbde36 100644
--- a/llvm/unittests/Support/BlockFrequencyTest.cpp
+++ b/llvm/unittests/Support/BlockFrequencyTest.cpp
@@ -125,4 +125,45 @@ TEST(BlockFrequencyTest, SaturatingRightShift) {
   EXPECT_EQ(Freq.getFrequency(), 0x1ULL);
 }
 
+TEST(BlockFrequencyTest, AlmostEqual) {
+  EXPECT_FALSE(BlockFrequency(0x1234).almostEqual(BlockFrequency(0), 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).almostEqual(BlockFrequency(0x1233), 20));
+  EXPECT_TRUE(BlockFrequency(0x1234).almostEqual(BlockFrequency(0x1234), 20));
+
+  EXPECT_FALSE(BlockFrequency(0).almostEqual(BlockFrequency(0x1234), 20));
+  EXPECT_FALSE(BlockFrequency(0x1233).almostEqual(BlockFrequency(0x1234), 20));
+  EXPECT_TRUE(BlockFrequency(0x1234).almostEqual(BlockFrequency(0x1234), 20));
+
+  EXPECT_FALSE(BlockFrequency(0x1235).almostEqual(BlockFrequency(0x1234), 20));
+  EXPECT_FALSE(BlockFrequency(0x1234).almostEqual(BlockFrequency(0x1235), 20));
+
+  EXPECT_TRUE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 1));
+  EXPECT_TRUE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 2));
+  EXPECT_TRUE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 8));
+  EXPECT_TRUE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 14));
+  EXPECT_FALSE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 15));
+  EXPECT_FALSE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 16));
+  EXPECT_FALSE(BlockFrequency(0x4129).almostEqual(BlockFrequency(0x4128), 63));
+
+  EXPECT_FALSE(
+      BlockFrequency(0x10000000000).almostEqual(BlockFrequency(0x10), 5));
+
+  BlockFrequency Max = BlockFrequency::max();
+  EXPECT_TRUE(BlockFrequency(Max).almostEqual(Max, 0));
+  EXPECT_TRUE(BlockFrequency(Max).almostEqual(Max, 1));
+  EXPECT_TRUE(BlockFrequency(Max).almostEqual(Max, 63));
+
+  BlockFrequency Zero = BlockFrequency(0);
+  EXPECT_TRUE(BlockFrequency(Max).almostEqual(Zero, 0));
+  EXPECT_FALSE(BlockFrequency(Max).almostEqual(Zero, 1));
+  EXPECT_FALSE(BlockFrequency(Max).almostEqual(Zero, 63));
+
+  EXPECT_TRUE(BlockFrequency(Zero).almostEqual(Max, 0));
+  EXPECT_FALSE(BlockFrequency(Zero).almostEqual(Max, 1));
+  EXPECT_FALSE(BlockFrequency(Zero).almostEqual(Max, 63));
+
+  EXPECT_TRUE(BlockFrequency(Zero).almostEqual(Zero, 0));
+  EXPECT_TRUE(BlockFrequency(Zero).almostEqual(Zero, 1));
+  EXPECT_TRUE(BlockFrequency(Zero).almostEqual(Zero, 63));
+}
 }

>From 3f68a35583ac46b3049240f072f1b03fb0003b57 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 15:02:56 -0700
Subject: [PATCH 3/4] Add test for BlockFrequencyInfo precision for small
 min/max spreads

---
 .../Analysis/BlockFrequencyInfo/precision.ll  | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 llvm/test/Analysis/BlockFrequencyInfo/precision.ll

diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
new file mode 100644
index 000000000000000..4001fe991d6d9e8
--- /dev/null
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -disable-output -passes="print<block-freq>" 2>&1 | FileCheck %s
+; Sanity check precision for small-ish min/max spread.
+
+ at g = global i32 0
+
+; CHECK-LABEL: block-frequency-info: func0
+; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - join: float = 1.0, {{.*}}, count = 1000
+
+define void @func0(i32 %a0, i32 %a1) !prof !0 {
+entry:
+  %cmp0 = icmp ne i32 %a0, 0
+  br i1 %cmp0, label %cmp0_true, label %cmp0_false, !prof !1
+
+cmp0_true:
+  store volatile i32 1, ptr @g
+  %cmp1 = icmp ne i32 %a1, 0
+  br i1 %cmp1, label %cmp1_true, label %cmp1_false, !prof !2
+
+cmp0_false:
+  store volatile i32 2, ptr @g
+  br label %join
+
+cmp1_true:
+  store volatile i32 3, ptr @g
+  br label %join
+
+cmp1_false:
+  store volatile i32 4, ptr @g
+  br label %join
+
+join:
+  store volatile i32 5, ptr @g
+  ret void
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 400, i32 600}
+!2 = !{!"branch_weights", i32 1, i32 3}

>From 784c1f81eb688e6fe561ddf18903f68a81047226 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 11 Sep 2023 14:45:09 -0700
Subject: [PATCH 4/4] BlockFrequencyInfoImpl: Increase precision for small
 min/max spreads

BlockFrequencyInfo stores its result as integer values expressing
frequencies within a function relative to the frequency of the entry
block. This also means that the precision for frequencies < 1.0
depends on the choice for the entry block.

Before this change we would often end up with unnecessarily small
choices resuling in unnecessarily poor precision. This simplifies the
algorithm to use the full 64bits available as much as possible.
---
 ...rprof-gcov-multiple-bbs-single-line.c.gcov |  16 +-
 llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp  |  34 +--
 .../loops_with_profile_info.ll                |  19 +-
 .../Analysis/BlockFrequencyInfo/precision.ll  |   6 +-
 .../arm64-spill-remarks-treshold-hotness.ll   |   2 +-
 .../greedy-broken-ssa-verifier-error.mir      |   2 +-
 ...ne-sink-temporal-divergence-swdev407790.ll |  57 ++--
 .../AMDGPU/tuple-allocation-failure.ll        | 174 ++++++------
 llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll   |  80 +++---
 llvm/test/CodeGen/PowerPC/reduce_cr.ll        |  18 +-
 llvm/test/CodeGen/PowerPC/tail-dup-layout.ll  |  10 +-
 llvm/test/CodeGen/RISCV/branch-relaxation.ll  |  86 +++---
 .../test/CodeGen/Thumb2/mve-blockplacement.ll | 264 +++++++++---------
 .../CodeGen/Thumb2/mve-float16regloops.ll     |  57 ++--
 .../CodeGen/Thumb2/mve-float32regloops.ll     |  93 +++---
 llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll  |  22 +-
 llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll  | 119 ++++----
 llvm/test/CodeGen/VE/Scalar/br_jt.ll          |  40 +--
 llvm/test/CodeGen/VE/Scalar/brind.ll          |  15 +-
 llvm/test/CodeGen/X86/bb_rotate.ll            |   2 +-
 .../X86/code_placement_ext_tsp_large.ll       |   8 +-
 .../X86/div-rem-pair-recomposition-signed.ll  | 134 +++++----
 .../div-rem-pair-recomposition-unsigned.ll    | 206 +++++++-------
 llvm/test/CodeGen/X86/fsafdo_test3.ll         |  80 +++---
 llvm/test/CodeGen/X86/pr38795.ll              |  99 +++----
 .../speculative-load-hardening-indirect.ll    |  94 +++----
 llvm/test/CodeGen/X86/statepoint-ra.ll        |   2 +-
 llvm/test/CodeGen/X86/switch.ll               |  23 +-
 .../X86/tail-dup-no-other-successor.ll        |   2 +-
 llvm/test/CodeGen/X86/tail-opts.ll            |  14 +-
 llvm/test/Other/cfg-printer-branch-weights.ll |   4 +-
 llvm/test/ThinLTO/X86/function_entry_count.ll |   2 +-
 .../CodeExtractor/MultipleExitBranchProb.ll   |   2 +-
 .../X86/pr52689-not-all-uses-rebased.ll       |   4 +
 .../Transforms/JumpThreading/thread-prob-7.ll |   2 +-
 .../JumpThreading/update-edge-weight.ll       |   2 +-
 llvm/test/Transforms/LICM/loopsink.ll         |  20 +-
 .../AArch64/opt-remark-with-hotness.ll        |   2 +-
 .../diagnostics-with-hotness.ll               |   2 +-
 .../LoopRotate/update-branch-weights.ll       |  12 +-
 .../Transforms/LoopVectorize/X86/avx512.ll    |  23 +-
 .../X86/no_fpmath_with_hotness.ll             |   2 +-
 .../LoopVectorize/diag-with-hotness-info-2.ll |   2 +-
 .../LoopVectorize/diag-with-hotness-info.ll   |   2 +-
 .../PGOProfile/Inputs/PR41279_2.proftext      |   3 +-
 .../PGOProfile/Inputs/criticaledge.proftext   |   4 +-
 .../Inputs/criticaledge_entry.proftext        |   4 +-
 .../PGOProfile/Inputs/indirectbr.proftext     |   2 +-
 .../Inputs/indirectbr_entry.proftext          |   2 +-
 llvm/test/Transforms/PGOProfile/PR41279_2.ll  |  23 +-
 .../Transforms/PGOProfile/bfi_verification.ll |  17 +-
 .../Transforms/PGOProfile/criticaledge.ll     |   6 +-
 llvm/test/Transforms/PGOProfile/fix_bfi.ll    |   2 +-
 llvm/test/Transforms/PGOProfile/loop2.ll      |   6 +-
 .../profile-correlation-irreducible-loops.ll  |   6 +-
 .../profile-inference-rebalance.ll            |   8 +-
 .../SampleProfile/pseudo-probe-update-2.ll    |  10 +-
 57 files changed, 972 insertions(+), 980 deletions(-)

diff --git a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
index 4debf8fc1b680d9..9297073d21ef80e 100644
--- a/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
+++ b/compiler-rt/test/profile/Inputs/instrprof-gcov-multiple-bbs-single-line.c.gcov
@@ -10,25 +10,25 @@
 // CHECK-NEXT:        -:    4:
 // CHECK-NEXT:        1:    5:  int a = 1;
 // CHECK-NEXT:        1:    6:  if (a) {
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        1:    7:    var++;
 // CHECK-NEXT:        1:    8:  }
 // CHECK-NEXT:        -:    9:
 // CHECK-NEXT:        1:   10:  if (a) {}
-// CHECK-NEXT:branch  0 taken 1
-// CHECK-NEXT:branch  1 taken 0
+// CHECK-NEXT:branch  0 taken 0
+// CHECK-NEXT:branch  1 taken 1
 // CHECK-NEXT:        -:   11:
 // CHECK-NEXT:        1:   12:  int b = 0;
 // CHECK-NEXT:        1:   13:  if (b) {
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:    #####:   14:    var++;
 // CHECK-NEXT:    #####:   15:  }
 // CHECK-NEXT:        -:   16:
 // CHECK-NEXT:        1:   17:  if (b) {}
-// CHECK-NEXT:branch  0 taken 0
-// CHECK-NEXT:branch  1 taken 1
+// CHECK-NEXT:branch  0 taken 1
+// CHECK-NEXT:branch  1 taken 0
 // CHECK-NEXT:        -:   18:
 // CHECK-NEXT:        1:   19:  return 0;
 // CHECK-NEXT:        -:   20:}
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 6f944990c78674a..ae08d56ef098a75 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -481,30 +481,24 @@ void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
 
 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
                                      const Scaled64 &Min, const Scaled64 &Max) {
-  // Scale the Factor to a size that creates integers.  Ideally, integers would
-  // be scaled so that Max == UINT64_MAX so that they can be best
-  // differentiated.  However, in the presence of large frequency values, small
-  // frequencies are scaled down to 1, making it impossible to differentiate
-  // small, unequal numbers. When the spread between Min and Max frequencies
-  // fits well within MaxBits, we make the scale be at least 8.
-  const unsigned MaxBits = 64;
-  const unsigned SpreadBits = (Max / Min).lg();
-  Scaled64 ScalingFactor;
-  if (SpreadBits <= MaxBits - 3) {
-    // If the values are small enough, make the scaling factor at least 8 to
-    // allow distinguishing small values.
-    ScalingFactor = Min.inverse();
-    ScalingFactor <<= 3;
-  } else {
-    // If the values need more than MaxBits to be represented, saturate small
-    // frequency values down to 1 by using a scaling factor that benefits large
-    // frequency values.
-    ScalingFactor = Scaled64(1, MaxBits) / Max;
-  }
+  // Scale the Factor to a size that creates integers.  If possible scale
+  // integers so that Max == UINT64_MAX so that they can be best differentiated.
+  // Is is possible that the range between min and max cannot be accurately
+  // represented in a 64bit integer without either loosing precision for small
+  // values (so small unequal numbers all map to 1) or saturaturing big numbers
+  // loosing precision for big numbers (so unequal big numbers may map to
+  // UINT64_MAX). We choose to loose precision for small numbers.
+  const unsigned MaxBits = sizeof(Scaled64::DigitsType) * CHAR_BIT;
+  // Users often add up multiple BlockFrequency values or multiply them with
+  // things like instruction costs. Leave some room to avoid saturating
+  // operations reaching UIN64_MAX too early.
+  const unsigned Slack = 10;
+  Scaled64 ScalingFactor = Scaled64(1, MaxBits - Slack) / Max;
 
   // Translate the floats to integers.
   LLVM_DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
                     << ", factor = " << ScalingFactor << "\n");
+  (void)Min;
   for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
     Scaled64 Scaled = BFI.Freqs[Index].Scaled * ScalingFactor;
     BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
index 41226a1cdfbaf32..7cebfb114f4ed4e 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/loops_with_profile_info.ll
@@ -59,7 +59,7 @@ declare i32 @printf(i8*, ...)
 
 ; CHECK: Printing analysis {{.*}} for function 'main':
 ; CHECK-NEXT: block-frequency-info: main
-define i32 @main() {
+define i32 @main() !prof !6 {
 entry:
   %retval = alloca i32, align 4
   %i = alloca i32, align 4
@@ -93,7 +93,7 @@ for.cond4:                                        ; preds = %for.inc, %for.body3
   %cmp5 = icmp slt i32 %2, 100
   br i1 %cmp5, label %for.body6, label %for.end, !prof !3
 
-; CHECK: - for.body6: float = 500000.5, int = 4000004
+; CHECK: - for.body6: float = 1000000.0,{{.*}}count = 1000000
 for.body6:                                        ; preds = %for.cond4
   call void @bar()
   br label %for.inc
@@ -143,7 +143,7 @@ for.cond16:                                       ; preds = %for.inc19, %for.bod
   %cmp17 = icmp slt i32 %8, 10000
   br i1 %cmp17, label %for.body18, label %for.end21, !prof !4
 
-; CHECK: - for.body18: float = 499999.9, int = 3999998
+; CHECK: - for.body18: float = 999999.5,{{.*}}count = 1000000
 for.body18:                                       ; preds = %for.cond16
   call void @bar()
   br label %for.inc19
@@ -175,7 +175,7 @@ for.cond26:                                       ; preds = %for.inc29, %for.end
   %cmp27 = icmp slt i32 %12, 1000000
   br i1 %cmp27, label %for.body28, label %for.end31, !prof !5
 
-; CHECK: - for.body28: float = 499995.2, int = 3999961
+; CHECK: - for.body28: float = 1000224.3,{{.*}}count = 1000224
 for.body28:                                       ; preds = %for.cond26
   call void @bar()
   br label %for.inc29
@@ -197,8 +197,9 @@ for.end31:                                        ; preds = %for.cond26
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 3.7.0 (trunk 232635) (llvm/trunk 232636)"}
-!1 = !{!"branch_weights", i32 101, i32 2}
-!2 = !{!"branch_weights", i32 10001, i32 101}
-!3 = !{!"branch_weights", i32 1000001, i32 10001}
-!4 = !{!"branch_weights", i32 1000001, i32 101}
-!5 = !{!"branch_weights", i32 1000001, i32 2}
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 10000, i32 100}
+!3 = !{!"branch_weights", i32 1000000, i32 10000}
+!4 = !{!"branch_weights", i32 1000000, i32 100}
+!5 = !{!"branch_weights", i32 1000000, i32 1}
+!6 = !{!"function_entry_count", i32 1}
diff --git a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
index 4001fe991d6d9e8..7408d002d065d5b 100644
--- a/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
+++ b/llvm/test/Analysis/BlockFrequencyInfo/precision.ll
@@ -5,10 +5,10 @@
 
 ; CHECK-LABEL: block-frequency-info: func0
 ; CHECK: - entry: float = 1.0, {{.*}}, count = 1000
-; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 388
+; CHECK: - cmp0_true: float = 0.4, {{.*}}, count = 400
 ; CHECK: - cmp0_false: float = 0.6, {{.*}}, count = 600
-; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 88
-; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 288
+; CHECK: - cmp1_true: float = 0.1, {{.*}}, count = 100
+; CHECK: - cmp1_false: float = 0.3, {{.*}}, count = 300
 ; CHECK: - join: float = 1.0, {{.*}}, count = 1000
 
 define void @func0(i32 %a0, i32 %a1) !prof !0 {
diff --git a/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll b/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
index 0578ab585402af9..5f849c67b0ca318 100644
--- a/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-spill-remarks-treshold-hotness.ll
@@ -5,7 +5,7 @@
 ; RUN:       -pass-remarks-with-hotness -pass-remarks-hotness-threshold=1 \
 ; RUN:       2>&1 | FileCheck -check-prefix=THRESHOLD %s
 
-; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.187500e+01 total spills cost 1 reloads 3.187500e+01 total reloads cost generated in loop{{$}}
+; CHECK: remark: /tmp/kk.c:3:20: 1 spills 3.200000e+01 total spills cost 1 reloads 3.200000e+01 total reloads cost generated in loop{{$}}
 ; THRESHOLD-NOT: remark
 
 define void @fpr128(ptr %p) nounwind ssp {
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
index 537bea7d2cfbe39..7a623d235950dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-broken-ssa-verifier-error.mir
@@ -15,7 +15,7 @@ machineFunctionInfo:
 body:             |
   ; GCN-LABEL: name: ra_introduces_vreg_def
   ; GCN: [[COPY_V0:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GCN: [[COPY_V0]]:vgpr_32 =
+  ; GCN: [[COPY_V1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   bb.0:
     liveins: $vgpr0, $vgpr1
     %0:vgpr_32 = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index e2683bba37f4bc9..75f3b5463c3944b 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -150,16 +150,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_i32 s54, s55, 1
 ; CHECK-NEXT:    s_add_i32 s5, s55, 5
 ; CHECK-NEXT:    v_or3_b32 v57, s4, v43, s54
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v56, v0
-; CHECK-NEXT:    v_mov_b32_e32 v59, s54
+; CHECK-NEXT:    ds_read_u8 v0, v0
+; CHECK-NEXT:    v_mov_b32_e32 v58, s54
 ; CHECK-NEXT:    s_mov_b32 s56, exec_lo
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v56, 0xff, v0
 ; CHECK-NEXT:    v_cmpx_lt_u32_e64 s5, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_17
 ; CHECK-NEXT:  ; %bb.6: ; %.preheader2
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v58, 0xff, v56
 ; CHECK-NEXT:    s_mov_b32 s57, 0
 ; CHECK-NEXT:    s_mov_b32 s58, 0
 ; CHECK-NEXT:    s_branch .LBB0_8
@@ -171,18 +170,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_add_i32 s5, s4, 5
 ; CHECK-NEXT:    s_add_i32 s4, s4, 1
 ; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s5, v42
-; CHECK-NEXT:    v_mov_b32_e32 v59, s4
+; CHECK-NEXT:    v_mov_b32_e32 v58, s4
 ; CHECK-NEXT:    s_or_b32 s57, vcc_lo, s57
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s57
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_16
 ; CHECK-NEXT:  .LBB0_8: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    v_add_nc_u32_e32 v60, s58, v46
-; CHECK-NEXT:    v_add_nc_u32_e32 v59, s58, v57
+; CHECK-NEXT:    v_add_nc_u32_e32 v59, s58, v46
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, s58, v57
 ; CHECK-NEXT:    s_mov_b32 s59, exec_lo
-; CHECK-NEXT:    ds_read_u8 v0, v60
+; CHECK-NEXT:    ds_read_u8 v0, v59
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
@@ -197,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:    ds_write_b32 v0, v58
 ; CHECK-NEXT:  .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
-; CHECK-NEXT:    ds_read_u8 v0, v60 offset:1
+; CHECK-NEXT:    ds_read_u8 v0, v59 offset:1
 ; CHECK-NEXT:    s_mov_b32 s59, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_12
 ; CHECK-NEXT:  ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
@@ -215,17 +214,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_mov_b32 s12, s41
 ; CHECK-NEXT:    s_mov_b32 s13, s40
 ; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    v_add_nc_u32_e32 v61, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v60, 1, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:    ds_write_b32 v0, v60
 ; CHECK-NEXT:  .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
-; CHECK-NEXT:    ds_read_u8 v0, v60 offset:2
+; CHECK-NEXT:    ds_read_u8 v0, v59 offset:2
 ; CHECK-NEXT:    s_mov_b32 s59, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_14
 ; CHECK-NEXT:  ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
@@ -237,17 +236,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_mov_b32 s12, s41
 ; CHECK-NEXT:    s_mov_b32 s13, s40
 ; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    v_add_nc_u32_e32 v61, 2, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v60, 2, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT:    ds_write_b32 v0, v61
+; CHECK-NEXT:    ds_write_b32 v0, v60
 ; CHECK-NEXT:  .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s59
-; CHECK-NEXT:    ds_read_u8 v0, v60 offset:3
+; CHECK-NEXT:    ds_read_u8 v0, v59 offset:3
 ; CHECK-NEXT:    s_mov_b32 s59, exec_lo
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmpx_eq_u16_e64 v58, v0
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_7
 ; CHECK-NEXT:  ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
@@ -259,11 +258,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    s_mov_b32 s12, s41
 ; CHECK-NEXT:    s_mov_b32 s13, s40
 ; CHECK-NEXT:    s_mov_b32 s14, s33
-; CHECK-NEXT:    v_add_nc_u32_e32 v59, 3, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, 3, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v47, 1, v47
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[42:43]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CHECK-NEXT:    ds_write_b32 v0, v59
+; CHECK-NEXT:    ds_write_b32 v0, v58
 ; CHECK-NEXT:    s_branch .LBB0_7
 ; CHECK-NEXT:  .LBB0_16: ; %Flow43
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
@@ -273,7 +272,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s56
 ; CHECK-NEXT:    s_mov_b32 s55, exec_lo
-; CHECK-NEXT:    v_cmpx_lt_u32_e64 v59, v42
+; CHECK-NEXT:    v_cmpx_lt_u32_e64 v58, v42
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_23
 ; CHECK-NEXT:  ; %bb.18: ; %.preheader
 ; CHECK-NEXT:    ; in Loop: Header=BB0_5 Depth=1
@@ -283,19 +282,19 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    .p2align 6
 ; CHECK-NEXT:  .LBB0_19: ; in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s57
-; CHECK-NEXT:    v_add_nc_u32_e32 v59, 1, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v58, 1, v58
 ; CHECK-NEXT:    v_add_nc_u32_e32 v57, 1, v57
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v59, v42
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc_lo, v58, v42
 ; CHECK-NEXT:    s_or_b32 s56, vcc_lo, s56
 ; CHECK-NEXT:    s_andn2_b32 exec_lo, exec_lo, s56
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_22
 ; CHECK-NEXT:  .LBB0_20: ; Parent Loop BB0_5 Depth=1
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v59
+; CHECK-NEXT:    v_add_nc_u32_e32 v0, v44, v58
+; CHECK-NEXT:    s_mov_b32 s57, exec_lo
 ; CHECK-NEXT:    ds_read_u8 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; CHECK-NEXT:    s_and_saveexec_b32 s57, s4
+; CHECK-NEXT:    v_cmpx_eq_u16_e64 v56, v0
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_19
 ; CHECK-NEXT:  ; %bb.21: ; in Loop: Header=BB0_20 Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v31, v41
diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index a4bec7f85754904..dcf49de68492405 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -114,7 +114,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS1-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
@@ -133,7 +133,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_8
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s79, 1
@@ -143,17 +143,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_cbranch_execnz .LBB1_8
-; GLOBALNESS1-NEXT:    s_branch .LBB1_23
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_8
+; GLOBALNESS1-NEXT:    s_branch .LBB1_9
 ; GLOBALNESS1-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_branch .LBB1_23
-; GLOBALNESS1-NEXT:  .LBB1_8: ; %Flow25
+; GLOBALNESS1-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %baz.exit.i
+; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    flat_load_dword v0, v[2:3]
@@ -163,17 +167,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS1-NEXT:  ; %bb.10: ; %bb33.i
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[52:53]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_12
-; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb39.i
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
+; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS1-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
@@ -182,40 +186,40 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[64:65], 0, v2
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS1-NEXT:    s_branch .LBB1_15
-; GLOBALNESS1-NEXT:  .LBB1_13: ; %Flow16
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_branch .LBB1_16
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb63.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb44.i
+; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[60:61]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.16: ; %bb46.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb50.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb3.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[44:45]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb6.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[66:67]
-; GLOBALNESS1-NEXT:  .LBB1_20: ; %spam.exit.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb55.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    s_add_u32 s68, s38, 40
 ; GLOBALNESS1-NEXT:    s_addc_u32 s69, s39, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -239,19 +243,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[76:77]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_13
-; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb62.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS1-NEXT:    s_branch .LBB1_13
-; GLOBALNESS1-NEXT:  .LBB1_23: ; %LeafBlock
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s79, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr0_vgpr1
@@ -403,7 +400,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS0-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
 ; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
@@ -422,7 +419,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[46:47]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_8
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
 ; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s75, 1
@@ -432,17 +429,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_cbranch_execnz .LBB1_8
-; GLOBALNESS0-NEXT:    s_branch .LBB1_23
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_8
+; GLOBALNESS0-NEXT:    s_branch .LBB1_9
 ; GLOBALNESS0-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_branch .LBB1_23
-; GLOBALNESS0-NEXT:  .LBB1_8: ; %Flow25
+; GLOBALNESS0-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow25
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %baz.exit.i
+; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    flat_load_dword v0, v[2:3]
@@ -452,17 +453,17 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[80:81], s[62:63]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
-; GLOBALNESS0-NEXT:  ; %bb.10: ; %bb33.i
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[52:53]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_12
-; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb39.i
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
+; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[42:43], off
-; GLOBALNESS0-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
@@ -471,40 +472,40 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[64:65], 0, v2
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[66:67], 1, v0
-; GLOBALNESS0-NEXT:    s_branch .LBB1_15
-; GLOBALNESS0-NEXT:  .LBB1_13: ; %Flow16
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_branch .LBB1_16
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow16
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb63.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[50:51]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
-; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb44.i
+; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[60:61]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.16: ; %bb46.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[48:49]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb50.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[42:43]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb3.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[44:45]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
-; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb6.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
+; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[66:67]
-; GLOBALNESS0-NEXT:  .LBB1_20: ; %spam.exit.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb55.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    s_add_u32 s72, s38, 40
 ; GLOBALNESS0-NEXT:    s_addc_u32 s73, s39, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
@@ -528,19 +529,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[78:79]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[64:65]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_13
-; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb62.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[42:43], off
-; GLOBALNESS0-NEXT:    s_branch .LBB1_13
-; GLOBALNESS0-NEXT:  .LBB1_23: ; %LeafBlock
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s75, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:    s_branch .LBB1_14
 ; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr0_vgpr1
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
index e97e17fc64782f4..b41f6c75133035d 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
@@ -87,11 +87,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_8
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_9: # %bb17
+; CHECK-NEXT:  .LBB0_9: # %bb54
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_9
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_10: # %bb26
+; CHECK-NEXT:  .LBB0_10: # %bb17
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_10
 ; CHECK-NEXT:    .p2align 4
@@ -99,7 +99,7 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_11
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_12: # %bb54
+; CHECK-NEXT:  .LBB0_12: # %bb26
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_12
 ; CHECK-NEXT:    .p2align 4
@@ -107,71 +107,71 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_13
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_14: # %bb47
+; CHECK-NEXT:  .LBB0_14: # %bb62
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_14
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_15: # %bb24
+; CHECK-NEXT:  .LBB0_15: # %bb47
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_15
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_16: # %bb19
+; CHECK-NEXT:  .LBB0_16: # %bb48
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_16
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_17: # %bb59
+; CHECK-NEXT:  .LBB0_17: # %bb49
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_17
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_18: # %bb46
+; CHECK-NEXT:  .LBB0_18: # %bb50
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_18
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_19: # %bb49
+; CHECK-NEXT:  .LBB0_19: # %bb58
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_19
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_20: # %bb57
+; CHECK-NEXT:  .LBB0_20: # %bb59
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_20
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_21: # %bb18
+; CHECK-NEXT:  .LBB0_21: # %bb60
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_21
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_22: # %bb58
+; CHECK-NEXT:  .LBB0_22: # %bb24
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_22
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_23: # %bb23
+; CHECK-NEXT:  .LBB0_23: # %bb19
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_23
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_24: # %bb60
+; CHECK-NEXT:  .LBB0_24: # %bb20
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_24
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_25: # %bb55
+; CHECK-NEXT:  .LBB0_25: # %bb23
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_25
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_26: # %bb56
+; CHECK-NEXT:  .LBB0_26: # %bb46
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_26
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_27: # %bb62
+; CHECK-NEXT:  .LBB0_27: # %bb55
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_27
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_28: # %bb20
+; CHECK-NEXT:  .LBB0_28: # %bb56
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_28
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_29: # %bb50
+; CHECK-NEXT:  .LBB0_29: # %bb57
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_29
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_30: # %bb48
+; CHECK-NEXT:  .LBB0_30: # %bb18
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    b .LBB0_30
 ; CHECK-NEXT:  .LBB0_31: # %bb9
@@ -280,11 +280,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_8
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_9: # %bb17
+; CHECK-BE-NEXT:  .LBB0_9: # %bb54
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_9
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_10: # %bb26
+; CHECK-BE-NEXT:  .LBB0_10: # %bb17
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_10
 ; CHECK-BE-NEXT:    .p2align 4
@@ -292,7 +292,7 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_11
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_12: # %bb54
+; CHECK-BE-NEXT:  .LBB0_12: # %bb26
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_12
 ; CHECK-BE-NEXT:    .p2align 4
@@ -300,71 +300,71 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_13
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_14: # %bb47
+; CHECK-BE-NEXT:  .LBB0_14: # %bb62
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_14
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_15: # %bb24
+; CHECK-BE-NEXT:  .LBB0_15: # %bb47
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_15
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_16: # %bb19
+; CHECK-BE-NEXT:  .LBB0_16: # %bb48
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_16
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_17: # %bb59
+; CHECK-BE-NEXT:  .LBB0_17: # %bb49
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_17
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_18: # %bb46
+; CHECK-BE-NEXT:  .LBB0_18: # %bb50
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_18
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_19: # %bb49
+; CHECK-BE-NEXT:  .LBB0_19: # %bb58
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_19
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_20: # %bb57
+; CHECK-BE-NEXT:  .LBB0_20: # %bb59
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_20
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_21: # %bb18
+; CHECK-BE-NEXT:  .LBB0_21: # %bb60
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_21
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_22: # %bb58
+; CHECK-BE-NEXT:  .LBB0_22: # %bb24
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_22
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_23: # %bb23
+; CHECK-BE-NEXT:  .LBB0_23: # %bb19
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_23
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_24: # %bb60
+; CHECK-BE-NEXT:  .LBB0_24: # %bb20
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_24
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_25: # %bb55
+; CHECK-BE-NEXT:  .LBB0_25: # %bb23
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_25
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_26: # %bb56
+; CHECK-BE-NEXT:  .LBB0_26: # %bb46
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_26
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_27: # %bb62
+; CHECK-BE-NEXT:  .LBB0_27: # %bb55
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_27
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_28: # %bb20
+; CHECK-BE-NEXT:  .LBB0_28: # %bb56
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_28
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_29: # %bb50
+; CHECK-BE-NEXT:  .LBB0_29: # %bb57
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_29
 ; CHECK-BE-NEXT:    .p2align 4
-; CHECK-BE-NEXT:  .LBB0_30: # %bb48
+; CHECK-BE-NEXT:  .LBB0_30: # %bb18
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    b .LBB0_30
 ; CHECK-BE-NEXT:  .LBB0_31: # %bb9
diff --git a/llvm/test/CodeGen/PowerPC/reduce_cr.ll b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
index b1cac1cbc871aba..7491d13c5301015 100644
--- a/llvm/test/CodeGen/PowerPC/reduce_cr.ll
+++ b/llvm/test/CodeGen/PowerPC/reduce_cr.ll
@@ -4,10 +4,10 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; First block frequency info
 ;CHECK:      block-frequency-info: loop_test
-;CHECK-NEXT: - BB0[entry]: float = 1.0, int = 12
-;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = 34
-;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = 21
-;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = 8
+;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
+;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
+;CHECK-NEXT: - BB2[test1]: float = 1.6667, int = {{.*}}
+;CHECK-NEXT: - BB3[optional1]: float = 0.625, int = {{.*}}
 
 ;CHECK:      block-frequency-info: loop_test
 ;CHECK:      block-frequency-info: loop_test
@@ -15,11 +15,11 @@ target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; Last block frequency info
 ;CHECK:      block-frequency-info: loop_test
-;CHECK-NEXT: - BB0[entry]: float = 1.0, int = 12
-;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = 34
-;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = 27
-;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = 21
-;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = 8
+;CHECK-NEXT: - BB0[entry]: float = 1.0, int = {{.*}}
+;CHECK-NEXT: - BB1[for.check]: float = 2.6667, int = {{.*}}
+;CHECK-NEXT: - BB2[for.check]: float = 2.1667, int = {{.*}}
+;CHECK-NEXT: - BB3[test1]: float = 1.6667, int = {{.*}}
+;CHECK-NEXT: - BB4[optional1]: float = 0.625, int = {{.*}}
 
 
 define void @loop_test(ptr %tags, i32 %count) {
diff --git a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
index 8b4df1d2f99dac6..77d861ad0599c18 100644
--- a/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/llvm/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -372,19 +372,17 @@ exit:
 ; CHECK: # %bb.{{[0-9]+}}: # %entry
 ; CHECK: andi.
 ; CHECK: # %bb.{{[0-9]+}}: # %test2
-; Make sure then2 falls through from test2
+; Make sure else2 falls through from test2
 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
-; CHECK: # %bb.{{[0-9]+}}: # %then2
-; CHECK: andi. {{[0-9]+}}, {{[0-9]+}}, 4
+; CHECK: # %bb.{{[0-9]+}}: # %else2
+; CHECK: bl c
 ; CHECK: # %else1
 ; CHECK: bl a
 ; CHECK: bl a
-; Make sure then2 was copied into else1
+; CHECK: # %then2
 ; CHECK: andi. {{[0-9]+}}, {{[0-9]+}}, 4
 ; CHECK: # %end1
 ; CHECK: bl d
-; CHECK: # %else2
-; CHECK: bl c
 ; CHECK: # %end2
 define void @avoidable_test(i32 %tag) {
 entry:
diff --git a/llvm/test/CodeGen/RISCV/branch-relaxation.ll b/llvm/test/CodeGen/RISCV/branch-relaxation.ll
index 4f7736e318cae6b..3d48dc9637eaedf 100644
--- a/llvm/test/CodeGen/RISCV/branch-relaxation.ll
+++ b/llvm/test/CodeGen/RISCV/branch-relaxation.ll
@@ -2769,42 +2769,22 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    li t6, 31
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    bne t5, t6, .LBB6_1
-; CHECK-RV32-NEXT:  # %bb.7: # %entry
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_8, s11
-; CHECK-RV32-NEXT:  .LBB6_1: # %cond_2
-; CHECK-RV32-NEXT:    bne t3, t4, .LBB6_2
-; CHECK-RV32-NEXT:  # %bb.9: # %cond_2
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_10, s11
-; CHECK-RV32-NEXT:  .LBB6_2: # %cond_3
-; CHECK-RV32-NEXT:    bne t1, t2, .LBB6_3
-; CHECK-RV32-NEXT:  # %bb.11: # %cond_3
-; CHECK-RV32-NEXT:    sw s11, 0(sp)
-; CHECK-RV32-NEXT:    jump .LBB6_12, s11
-; CHECK-RV32-NEXT:  .LBB6_3: # %space
-; CHECK-RV32-NEXT:    #APP
-; CHECK-RV32-NEXT:    .zero 1048576
-; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_4
+; CHECK-RV32-NEXT:    bne t5, t6, .LBB6_2
+; CHECK-RV32-NEXT:    j .LBB6_1
 ; CHECK-RV32-NEXT:  .LBB6_8: # %dest_1
 ; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_4: # %dest_1
+; CHECK-RV32-NEXT:  .LBB6_1: # %dest_1
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 1
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_5
-; CHECK-RV32-NEXT:  .LBB6_10: # %dest_2
-; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_5: # %dest_2
+; CHECK-RV32-NEXT:    j .LBB6_3
+; CHECK-RV32-NEXT:  .LBB6_2: # %cond_2
+; CHECK-RV32-NEXT:    bne t3, t4, .LBB6_5
+; CHECK-RV32-NEXT:  .LBB6_3: # %dest_2
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 2
 ; CHECK-RV32-NEXT:    #NO_APP
-; CHECK-RV32-NEXT:    j .LBB6_6
-; CHECK-RV32-NEXT:  .LBB6_12: # %dest_3
-; CHECK-RV32-NEXT:    lw s11, 0(sp)
-; CHECK-RV32-NEXT:  .LBB6_6: # %dest_3
+; CHECK-RV32-NEXT:  .LBB6_4: # %dest_3
 ; CHECK-RV32-NEXT:    #APP
 ; CHECK-RV32-NEXT:    # dest 3
 ; CHECK-RV32-NEXT:    #NO_APP
@@ -2907,6 +2887,15 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV32-NEXT:    lw s11, 12(sp) # 4-byte Folded Reload
 ; CHECK-RV32-NEXT:    addi sp, sp, 64
 ; CHECK-RV32-NEXT:    ret
+; CHECK-RV32-NEXT:  .LBB6_5: # %cond_3
+; CHECK-RV32-NEXT:    beq t1, t2, .LBB6_4
+; CHECK-RV32-NEXT:  # %bb.6: # %space
+; CHECK-RV32-NEXT:    #APP
+; CHECK-RV32-NEXT:    .zero 1048576
+; CHECK-RV32-NEXT:    #NO_APP
+; CHECK-RV32-NEXT:  # %bb.7: # %space
+; CHECK-RV32-NEXT:    sw s11, 0(sp)
+; CHECK-RV32-NEXT:    jump .LBB6_8, s11
 ;
 ; CHECK-RV64-LABEL: relax_jal_spill_32_restore_block_correspondence:
 ; CHECK-RV64:       # %bb.0: # %entry
@@ -3026,34 +3015,21 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV64-NEXT:    sext.w t6, t6
 ; CHECK-RV64-NEXT:    sd t5, 16(sp) # 8-byte Folded Spill
 ; CHECK-RV64-NEXT:    sext.w t5, t5
-; CHECK-RV64-NEXT:    bne t5, t6, .LBB6_1
-; CHECK-RV64-NEXT:  # %bb.7: # %entry
-; CHECK-RV64-NEXT:    jump .LBB6_4, t5
-; CHECK-RV64-NEXT:  .LBB6_1: # %cond_2
-; CHECK-RV64-NEXT:    sext.w t5, t4
-; CHECK-RV64-NEXT:    sext.w t6, t3
-; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_2
-; CHECK-RV64-NEXT:  # %bb.9: # %cond_2
-; CHECK-RV64-NEXT:    jump .LBB6_5, t5
-; CHECK-RV64-NEXT:  .LBB6_2: # %cond_3
-; CHECK-RV64-NEXT:    sext.w t5, t2
-; CHECK-RV64-NEXT:    sext.w t6, t1
-; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_3
-; CHECK-RV64-NEXT:  # %bb.11: # %cond_3
-; CHECK-RV64-NEXT:    jump .LBB6_6, t5
-; CHECK-RV64-NEXT:  .LBB6_3: # %space
-; CHECK-RV64-NEXT:    #APP
-; CHECK-RV64-NEXT:    .zero 1048576
-; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_4: # %dest_1
+; CHECK-RV64-NEXT:    bne t5, t6, .LBB6_2
+; CHECK-RV64-NEXT:  .LBB6_1: # %dest_1
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 1
 ; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_5: # %dest_2
+; CHECK-RV64-NEXT:    j .LBB6_3
+; CHECK-RV64-NEXT:  .LBB6_2: # %cond_2
+; CHECK-RV64-NEXT:    sext.w t5, t4
+; CHECK-RV64-NEXT:    sext.w t6, t3
+; CHECK-RV64-NEXT:    bne t6, t5, .LBB6_5
+; CHECK-RV64-NEXT:  .LBB6_3: # %dest_2
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 2
 ; CHECK-RV64-NEXT:    #NO_APP
-; CHECK-RV64-NEXT:  .LBB6_6: # %dest_3
+; CHECK-RV64-NEXT:  .LBB6_4: # %dest_3
 ; CHECK-RV64-NEXT:    #APP
 ; CHECK-RV64-NEXT:    # dest 3
 ; CHECK-RV64-NEXT:    #NO_APP
@@ -3158,6 +3134,16 @@ define void @relax_jal_spill_32_restore_block_correspondence() {
 ; CHECK-RV64-NEXT:    ld s11, 24(sp) # 8-byte Folded Reload
 ; CHECK-RV64-NEXT:    addi sp, sp, 128
 ; CHECK-RV64-NEXT:    ret
+; CHECK-RV64-NEXT:  .LBB6_5: # %cond_3
+; CHECK-RV64-NEXT:    sext.w t5, t2
+; CHECK-RV64-NEXT:    sext.w t6, t1
+; CHECK-RV64-NEXT:    beq t6, t5, .LBB6_4
+; CHECK-RV64-NEXT:  # %bb.6: # %space
+; CHECK-RV64-NEXT:    #APP
+; CHECK-RV64-NEXT:    .zero 1048576
+; CHECK-RV64-NEXT:    #NO_APP
+; CHECK-RV64-NEXT:  # %bb.7: # %space
+; CHECK-RV64-NEXT:    jump .LBB6_1, t5
 entry:
   %ra = call i32 asm sideeffect "addi ra, x0, 1", "={ra}"()
   %t0 = call i32 asm sideeffect "addi t0, x0, 5", "={t0}"()
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 39bf97d880ea3f4..e22fd4cabfa529d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -357,48 +357,50 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    mov r12, r1
-; CHECK-NEXT:    subs r1, r0, #1
-; CHECK-NEXT:    sbcs r1, r12, #0
+; CHECK-NEXT:    mov lr, r0
+; CHECK-NEXT:    subs r0, #1
+; CHECK-NEXT:    sbcs r0, r1, #0
 ; CHECK-NEXT:    blt.w .LBB1_28
 ; CHECK-NEXT:  @ %bb.1: @ %for.cond2.preheader.lr.ph
-; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    csel lr, r2, r3, lt
-; CHECK-NEXT:    movw r4, #43691
-; CHECK-NEXT:    mov r1, lr
-; CHECK-NEXT:    cmp.w lr, #3
+; CHECK-NEXT:    csel r7, r2, r0, lt
+; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    mov r1, r7
+; CHECK-NEXT:    cmp r7, #3
 ; CHECK-NEXT:    it ls
 ; CHECK-NEXT:    movls r1, #3
-; CHECK-NEXT:    movt r4, #43690
-; CHECK-NEXT:    sub.w r1, r1, lr
-; CHECK-NEXT:    ldr r6, [sp, #128]
+; CHECK-NEXT:    mov r4, r2
+; CHECK-NEXT:    subs r1, r1, r7
+; CHECK-NEXT:    movw r2, #43691
 ; CHECK-NEXT:    adds r1, #2
+; CHECK-NEXT:    movt r2, #43690
+; CHECK-NEXT:    ldr r6, [sp, #128]
 ; CHECK-NEXT:    movw r8, :lower16:c
+; CHECK-NEXT:    umull r1, r2, r1, r2
 ; CHECK-NEXT:    movt r8, :upper16:c
-; CHECK-NEXT:    mov.w r9, #12
-; CHECK-NEXT:    umull r1, r4, r1, r4
+; CHECK-NEXT:    movs r1, #4
 ; CHECK-NEXT:    @ implicit-def: $r10
 ; CHECK-NEXT:    @ implicit-def: $r5
 ; CHECK-NEXT:    @ implicit-def: $r11
-; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    movs r1, #4
-; CHECK-NEXT:    strd r2, r12, [sp, #4] @ 8-byte Folded Spill
-; CHECK-NEXT:    add.w r3, r3, r4, lsr #1
-; CHECK-NEXT:    add.w r1, r1, r4, lsr #1
-; CHECK-NEXT:    movw r4, #65532
-; CHECK-NEXT:    vdup.32 q6, r3
-; CHECK-NEXT:    movt r4, #32767
-; CHECK-NEXT:    and.w r7, r1, r4
+; CHECK-NEXT:    mov.w r9, #12
+; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r0, r2, lsr #1
+; CHECK-NEXT:    add.w r1, r1, r2, lsr #1
+; CHECK-NEXT:    movw r2, #65532
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    movt r2, #32767
+; CHECK-NEXT:    and.w r3, r1, r2
 ; CHECK-NEXT:    adr r1, .LCPI1_0
-; CHECK-NEXT:    vdup.32 q7, r3
+; CHECK-NEXT:    vdup.32 q7, r0
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI1_1
 ; CHECK-NEXT:    vldrw.u32 q5, [r1]
-; CHECK-NEXT:    vadd.i32 q4, q0, lr
-; CHECK-NEXT:    b .LBB1_4
+; CHECK-NEXT:    strd r3, r7, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT:    vadd.i32 q4, q0, r7
+; CHECK-NEXT:    b .LBB1_6
 ; CHECK-NEXT:  .LBB1_2: @ %for.body6.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    mov r0, r11
 ; CHECK-NEXT:    cmn.w r11, #4
 ; CHECK-NEXT:    it le
@@ -407,7 +409,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    adds r0, #6
 ; CHECK-NEXT:    movt r2, #9362
 ; CHECK-NEXT:    sub.w r1, r0, r11
-; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:    mov r10, r3
 ; CHECK-NEXT:    umull r2, r3, r1, r2
 ; CHECK-NEXT:    subs r2, r1, r3
 ; CHECK-NEXT:    add.w r2, r3, r2, lsr #1
@@ -415,73 +417,81 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    lsls r3, r3, #3
 ; CHECK-NEXT:    sub.w r2, r3, r2, lsr #2
 ; CHECK-NEXT:    subs r1, r2, r1
+; CHECK-NEXT:    mov r3, r10
 ; CHECK-NEXT:    add r0, r1
+; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    add.w r11, r0, #7
-; CHECK-NEXT:    ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:  .LBB1_3: @ %for.cond.cleanup5
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  .LBB1_4: @ %for.cond.cleanup5
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    mov.w r10, #0
+; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup5
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    adds r5, #2
-; CHECK-NEXT:    subs r1, r5, r0
-; CHECK-NEXT:    asr.w r3, r5, #31
-; CHECK-NEXT:    sbcs.w r1, r3, r12
+; CHECK-NEXT:    subs.w r1, r5, lr
+; CHECK-NEXT:    asr.w r0, r5, #31
+; CHECK-NEXT:    sbcs.w r0, r0, r12
 ; CHECK-NEXT:    bge.w .LBB1_28
-; CHECK-NEXT:  .LBB1_4: @ %for.cond2.preheader
+; CHECK-NEXT:  .LBB1_6: @ %for.cond2.preheader
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB1_17 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_8 Depth 2
-; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_19 Depth 2
+; CHECK-NEXT:    @ Child Loop BB1_10 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB1_12 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_14 Depth 3
 ; CHECK-NEXT:    cmp.w r11, #2
-; CHECK-NEXT:    bgt .LBB1_3
-; CHECK-NEXT:  @ %bb.5: @ %for.body6.lr.ph
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    cmp.w lr, #5
-; CHECK-NEXT:    bhi .LBB1_15
-; CHECK-NEXT:  @ %bb.6: @ %for.body6.us.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:    bgt .LBB1_5
+; CHECK-NEXT:  @ %bb.7: @ %for.body6.lr.ph
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    cmp r7, #5
+; CHECK-NEXT:    bhi .LBB1_17
+; CHECK-NEXT:  @ %bb.8: @ %for.body6.us.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    ldrd r2, r3, [sp, #120]
 ; CHECK-NEXT:    movs r0, #32
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:    mov r7, lr
+; CHECK-NEXT:    mov r4, r6
+; CHECK-NEXT:    mov r7, r12
+; CHECK-NEXT:    mov r6, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
+; CHECK-NEXT:    mov lr, r6
+; CHECK-NEXT:    mov r6, r4
+; CHECK-NEXT:    mov r12, r7
+; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vdup.32 q0, r2
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r2, r12, [sp, #4] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov lr, r7
-; CHECK-NEXT:    mov r7, r4
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    b .LBB1_8
-; CHECK-NEXT:  .LBB1_7: @ %for.cond.cleanup17.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    add.w r11, r3, #7
-; CHECK-NEXT:    cmn.w r3, #4
+; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    b .LBB1_10
+; CHECK-NEXT:  .LBB1_9: @ %for.cond.cleanup17.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
+; CHECK-NEXT:    add.w r11, r0, #7
+; CHECK-NEXT:    cmn.w r0, #4
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    bge .LBB1_3
-; CHECK-NEXT:  .LBB1_8: @ %for.body6.us
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    bge .LBB1_5
+; CHECK-NEXT:  .LBB1_10: @ %for.body6.us
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
-; CHECK-NEXT:    @ Child Loop BB1_10 Depth 3
 ; CHECK-NEXT:    @ Child Loop BB1_12 Depth 3
+; CHECK-NEXT:    @ Child Loop BB1_14 Depth 3
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    cbz r2, .LBB1_11
-; CHECK-NEXT:  @ %bb.9: @ %for.body13.us51.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    movw r4, :lower16:a
+; CHECK-NEXT:    cbz r4, .LBB1_13
+; CHECK-NEXT:  @ %bb.11: @ %for.body13.us51.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
+; CHECK-NEXT:    movw r2, :lower16:a
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:    movt r4, :upper16:a
-; CHECK-NEXT:    str r1, [r4]
-; CHECK-NEXT:    movw r4, :lower16:b
-; CHECK-NEXT:    movt r4, :upper16:b
-; CHECK-NEXT:    str r1, [r4]
-; CHECK-NEXT:    mov r4, r7
-; CHECK-NEXT:  .LBB1_10: @ %vector.body111
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_8 Depth=2
+; CHECK-NEXT:    movt r2, :upper16:a
+; CHECK-NEXT:    str r1, [r2]
+; CHECK-NEXT:    movw r2, :lower16:b
+; CHECK-NEXT:    movt r2, :upper16:b
+; CHECK-NEXT:    str r1, [r2]
+; CHECK-NEXT:    mov r2, r3
+; CHECK-NEXT:  .LBB1_12: @ %vector.body111
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB1_10 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
-; CHECK-NEXT:    subs r4, #4
+; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vcmp.u32 hi, q7, q2
 ; CHECK-NEXT:    vshl.i32 q2, q1, #2
 ; CHECK-NEXT:    add.w r1, r1, #4
@@ -489,18 +499,18 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_10
-; CHECK-NEXT:    b .LBB1_13
-; CHECK-NEXT:  .LBB1_11: @ %vector.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
-; CHECK-NEXT:    mov r4, r7
+; CHECK-NEXT:    bne .LBB1_12
+; CHECK-NEXT:    b .LBB1_15
+; CHECK-NEXT:  .LBB1_13: @ %vector.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
+; CHECK-NEXT:    mov r2, r3
 ; CHECK-NEXT:    vmov q1, q4
-; CHECK-NEXT:  .LBB1_12: @ %vector.body
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
-; CHECK-NEXT:    @ Parent Loop BB1_8 Depth=2
+; CHECK-NEXT:  .LBB1_14: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
+; CHECK-NEXT:    @ Parent Loop BB1_10 Depth=2
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vqadd.u32 q2, q5, r1
-; CHECK-NEXT:    subs r4, #4
+; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vcmp.u32 hi, q6, q2
 ; CHECK-NEXT:    vshl.i32 q2, q1, #2
 ; CHECK-NEXT:    add.w r1, r1, #4
@@ -508,64 +518,56 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
 ; CHECK-NEXT:    vadd.i32 q1, q1, r9
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [q2]
-; CHECK-NEXT:    bne .LBB1_12
-; CHECK-NEXT:  .LBB1_13: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    bne .LBB1_14
+; CHECK-NEXT:  .LBB1_15: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
 ; CHECK-NEXT:    cmp r6, #0
-; CHECK-NEXT:    beq .LBB1_7
-; CHECK-NEXT:  @ %bb.14: @ %for.cond9.for.cond15.preheader_crit_edge.us
-; CHECK-NEXT:    @ in Loop: Header=BB1_8 Depth=2
+; CHECK-NEXT:    beq .LBB1_9
+; CHECK-NEXT:  @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us
+; CHECK-NEXT:    @ in Loop: Header=BB1_10 Depth=2
 ; CHECK-NEXT:    eor r1, r10, #1
 ; CHECK-NEXT:    lsls r1, r1, #31
-; CHECK-NEXT:    bne .LBB1_7
+; CHECK-NEXT:    bne .LBB1_9
 ; CHECK-NEXT:    b .LBB1_26
-; CHECK-NEXT:  .LBB1_15: @ %for.body6.lr.ph.split
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  .LBB1_17: @ %for.body6.lr.ph.split
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
 ; CHECK-NEXT:    cmp r6, #0
 ; CHECK-NEXT:    beq.w .LBB1_2
-; CHECK-NEXT:  @ %bb.16: @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    ldrd r12, r0, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:  .LBB1_17: @ %for.body6.us60
-; CHECK-NEXT:    @ Parent Loop BB1_4 Depth=1
+; CHECK-NEXT:  @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:  .LBB1_19: @ %for.body6.us60
+; CHECK-NEXT:    @ Parent Loop BB1_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lsls.w r1, r10, #31
 ; CHECK-NEXT:    bne .LBB1_27
-; CHECK-NEXT:  @ %bb.18: @ %for.cond.cleanup17.us63
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #4
-; CHECK-NEXT:    bge .LBB1_22
-; CHECK-NEXT:  @ %bb.19: @ %for.cond.cleanup17.us63.1
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #12
-; CHECK-NEXT:    bgt .LBB1_23
-; CHECK-NEXT:  @ %bb.20: @ %for.cond.cleanup17.us63.2
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    cmn.w r3, #19
+; CHECK-NEXT:  @ %bb.20: @ %for.cond.cleanup17.us63
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #4
+; CHECK-NEXT:    bge.w .LBB1_3
+; CHECK-NEXT:  @ %bb.21: @ %for.cond.cleanup17.us63.1
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #12
 ; CHECK-NEXT:    bgt .LBB1_24
-; CHECK-NEXT:  @ %bb.21: @ %for.cond.cleanup17.us63.3
-; CHECK-NEXT:    @ in Loop: Header=BB1_17 Depth=2
-; CHECK-NEXT:    add.w r11, r3, #28
-; CHECK-NEXT:    cmn.w r3, #25
-; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    blt .LBB1_17
-; CHECK-NEXT:    b .LBB1_3
-; CHECK-NEXT:  .LBB1_22: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #7
-; CHECK-NEXT:    b .LBB1_25
-; CHECK-NEXT:  .LBB1_23: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #14
-; CHECK-NEXT:    b .LBB1_25
-; CHECK-NEXT:  .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
-; CHECK-NEXT:    add.w r11, r3, #21
-; CHECK-NEXT:  .LBB1_25: @ %for.cond.cleanup5
-; CHECK-NEXT:    @ in Loop: Header=BB1_4 Depth=1
+; CHECK-NEXT:  @ %bb.22: @ %for.cond.cleanup17.us63.2
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    cmn.w r0, #19
+; CHECK-NEXT:    bgt .LBB1_25
+; CHECK-NEXT:  @ %bb.23: @ %for.cond.cleanup17.us63.3
+; CHECK-NEXT:    @ in Loop: Header=BB1_19 Depth=2
+; CHECK-NEXT:    add.w r11, r0, #28
+; CHECK-NEXT:    cmn.w r0, #25
 ; CHECK-NEXT:    mov.w r10, #0
-; CHECK-NEXT:    b .LBB1_3
+; CHECK-NEXT:    mov r0, r11
+; CHECK-NEXT:    blt .LBB1_19
+; CHECK-NEXT:    b .LBB1_5
+; CHECK-NEXT:  .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    add.w r11, r0, #14
+; CHECK-NEXT:    b .LBB1_4
+; CHECK-NEXT:  .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
+; CHECK-NEXT:    @ in Loop: Header=BB1_6 Depth=1
+; CHECK-NEXT:    add.w r11, r0, #21
+; CHECK-NEXT:    b .LBB1_4
 ; CHECK-NEXT:  .LBB1_26: @ %for.inc19.us
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    b .LBB1_26
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 88131fcf21a9233..1c95d28b5eed1be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1021,24 +1021,29 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
 ; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_3: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_4: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_4
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_4: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    wls lr, r0, .LBB16_5
+; CHECK-NEXT:    b .LBB16_10
+; CHECK-NEXT:  .LBB16_5: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
 ; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
 ; CHECK-NEXT:    beq.w .LBB16_12
-; CHECK-NEXT:  .LBB16_5: @ %while.body
+; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    ldrh.w lr, [r3, #14]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
@@ -1074,14 +1079,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
-; CHECK-NEXT:    blo .LBB16_8
-; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    blo .LBB16_9
+; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_7: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_8: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r0, [r6], #16
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
@@ -1112,26 +1117,22 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    adds r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_7
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_8
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    mov r0, r5
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_11: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldrh r4, [r6], #2
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
-; CHECK-NEXT:    le lr, .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
-; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:    le lr, .LBB16_11
+; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
 ; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index ca6b8c2fffa22cc..808626d9a0aebe6 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1016,25 +1016,30 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    b .LBB16_6
+; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
+; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
 ; CHECK-NEXT:    b .LBB16_5
-; CHECK-NEXT:  .LBB16_3: @ %for.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_4: @ %for.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
 ; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
-; CHECK-NEXT:    wls lr, r0, .LBB16_4
-; CHECK-NEXT:    b .LBB16_9
-; CHECK-NEXT:  .LBB16_4: @ %while.end
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    wls lr, r0, .LBB16_5
+; CHECK-NEXT:    b .LBB16_10
+; CHECK-NEXT:  .LBB16_5: @ %while.end
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
 ; CHECK-NEXT:    add.w r4, r0, #16
 ; CHECK-NEXT:    beq .LBB16_12
-; CHECK-NEXT:  .LBB16_5: @ %while.body
+; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
-; CHECK-NEXT:    @ Child Loop BB16_7 Depth 2
-; CHECK-NEXT:    @ Child Loop BB16_10 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
+; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    ldrd r3, r7, [r10]
@@ -1042,7 +1047,8 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
 ; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
-; CHECK-NEXT:    strd r9, r1, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r3
 ; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
@@ -1060,14 +1066,14 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
-; CHECK-NEXT:    blo .LBB16_8
-; CHECK-NEXT:  @ %bb.6: @ %for.body.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    blo .LBB16_9
+; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    dls lr, r0
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:  .LBB16_7: @ %for.body
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_8: @ %for.body
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
 ; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
@@ -1088,26 +1094,22 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    vfma.f32 q0, q2, r11
 ; CHECK-NEXT:    vfma.f32 q0, q3, r9
 ; CHECK-NEXT:    vfma.f32 q0, q1, r1
-; CHECK-NEXT:    le lr, .LBB16_7
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    le lr, .LBB16_8
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    b .LBB16_3
-; CHECK-NEXT:  .LBB16_9: @ %while.body76.preheader
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
+; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
+; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    mov r3, r4
-; CHECK-NEXT:  .LBB16_10: @ %while.body76
-; CHECK-NEXT:    @ Parent Loop BB16_5 Depth=1
+; CHECK-NEXT:  .LBB16_11: @ %while.body76
+; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldr r0, [r7], #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    le lr, .LBB16_10
-; CHECK-NEXT:  @ %bb.11: @ %while.end.loopexit
-; CHECK-NEXT:    @ in Loop: Header=BB16_5 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
-; CHECK-NEXT:    b .LBB16_4
+; CHECK-NEXT:    le lr, .LBB16_11
+; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12:
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
@@ -1573,26 +1575,27 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    ldrd r6, r9, [r0]
-; CHECK-NEXT:    and r7, r3, #3
+; CHECK-NEXT:    ldrd r7, r9, [r0]
+; CHECK-NEXT:    and r6, r3, #3
 ; CHECK-NEXT:    ldr r0, [r0, #8]
 ; CHECK-NEXT:    lsrs r3, r3, #2
 ; CHECK-NEXT:    @ implicit-def: $r12
-; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB19_3
 ; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    mov r7, r5
+; CHECK-NEXT:    mov r2, r5
 ; CHECK-NEXT:    mov r4, r11
 ; CHECK-NEXT:    mov r8, r10
 ; CHECK-NEXT:  .LBB19_2: @ %if.end69
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldrd r2, r6, [sp, #8] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    adds r0, #128
-; CHECK-NEXT:    strd r7, r4, [r9]
-; CHECK-NEXT:    subs r6, #1
+; CHECK-NEXT:    strd r2, r4, [r9]
+; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    subs r7, #1
 ; CHECK-NEXT:    strd r3, r8, [r9, #8]
 ; CHECK-NEXT:    add.w r9, r9, #16
 ; CHECK-NEXT:    mov r1, r2
@@ -1600,11 +1603,11 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:  .LBB19_3: @ %do.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT:    str r6, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    mov r6, r2
 ; CHECK-NEXT:    ldrd r5, r11, [r9]
 ; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
 ; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    wls lr, r2, .LBB19_6
 ; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
@@ -1641,27 +1644,27 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:    le lr, .LBB19_5
 ; CHECK-NEXT:  .LBB19_6: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq .LBB19_1
 ; CHECK-NEXT:  @ %bb.7: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    ldrd lr, r4, [r1]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    ldrd r7, r1, [r1, #8]
+; CHECK-NEXT:    ldrd r2, r1, [r1, #8]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r1
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT:    vfma.f32 q0, q6, r7
+; CHECK-NEXT:    vfma.f32 q0, q6, r2
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
 ; CHECK-NEXT:    vfma.f32 q0, q7, r4
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
 ; CHECK-NEXT:    vfma.f32 q0, q4, lr
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r5
-; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    vfma.f32 q0, q2, r8
 ; CHECK-NEXT:    vfma.f32 q0, q1, r10
@@ -1670,19 +1673,19 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT:  @ %bb.8: @ %if.then58
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    str r5, [r6]
-; CHECK-NEXT:    mov r7, lr
+; CHECK-NEXT:    mov r2, lr
 ; CHECK-NEXT:    mov r4, r12
 ; CHECK-NEXT:    mov r3, r5
 ; CHECK-NEXT:    b .LBB19_12
 ; CHECK-NEXT:  .LBB19_9: @ %if.else
 ; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT:    vmov r8, s1
-; CHECK-NEXT:    cmp r2, #2
+; CHECK-NEXT:    cmp r3, #2
 ; CHECK-NEXT:    vstr s1, [r6, #4]
 ; CHECK-NEXT:    str r5, [r6]
 ; CHECK-NEXT:    bne .LBB19_11
 ; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT:    mov r7, r4
+; CHECK-NEXT:    mov r2, r4
 ; CHECK-NEXT:    mov r3, r8
 ; CHECK-NEXT:    mov r4, lr
 ; CHECK-NEXT:    mov r8, r5
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
index 747021e5c64eb30..f70af5661f4c904 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-vselect.ll
@@ -383,27 +383,27 @@ define arm_aapcs_vfpcc <2 x i64> @cmpeqz_v2i1_i1(<2 x i64> %a, <2 x i64> %b, i64
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, lr}
 ; CHECK-NEXT:    push {r4, lr}
-; CHECK-NEXT:    orr.w r2, r0, r1
+; CHECK-NEXT:    orr.w r3, r0, r1
 ; CHECK-NEXT:    vmov r0, r1, d2
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, r3, d3
+; CHECK-NEXT:    vmov r1, r2, d3
 ; CHECK-NEXT:    csetm r12, eq
 ; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    orrs r1, r3
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    csetm r4, eq
+; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, r2, d1
 ; CHECK-NEXT:    csetm lr, eq
-; CHECK-NEXT:    orrs r1, r3
-; CHECK-NEXT:    vmov r1, r4, d1
-; CHECK-NEXT:    csetm r3, eq
-; CHECK-NEXT:    orrs r1, r4
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    csetm r1, eq
-; CHECK-NEXT:    cbz r2, .LBB15_2
+; CHECK-NEXT:    cbz r3, .LBB15_2
 ; CHECK-NEXT:  @ %bb.1: @ %select.false
 ; CHECK-NEXT:    bfi r0, r12, #0, #8
-; CHECK-NEXT:    bfi r0, lr, #8, #8
+; CHECK-NEXT:    bfi r0, r4, #8, #8
 ; CHECK-NEXT:    b .LBB15_3
 ; CHECK-NEXT:  .LBB15_2:
-; CHECK-NEXT:    bfi r0, r3, #0, #8
+; CHECK-NEXT:    bfi r0, lr, #0, #8
 ; CHECK-NEXT:    bfi r0, r1, #8, #8
 ; CHECK-NEXT:  .LBB15_3: @ %select.end
 ; CHECK-NEXT:    vmsr p0, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index b5d981ef340254d..d8e0eb7119a9b27 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -6,101 +6,102 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr no
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #8
-; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    .pad #12
+; CHECK-NEXT:    sub sp, #12
 ; CHECK-NEXT:    cmp r3, #0
 ; CHECK-NEXT:    beq.w .LBB0_8
 ; CHECK-NEXT:  @ %bb.1: @ %entry
-; CHECK-NEXT:    mov r11, r2
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    bne .LBB0_3
 ; CHECK-NEXT:  @ %bb.2:
-; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    movs r7, #0
 ; CHECK-NEXT:    mov r12, r0
 ; CHECK-NEXT:    mov r8, r1
-; CHECK-NEXT:    mov r10, r11
+; CHECK-NEXT:    mov r10, r2
 ; CHECK-NEXT:    b .LBB0_6
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
-; CHECK-NEXT:    bic r2, r3, #1
-; CHECK-NEXT:    adr r4, .LCPI0_0
-; CHECK-NEXT:    subs r7, r2, #2
-; CHECK-NEXT:    movs r6, #1
 ; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r10, r11, r2, lsl #2
+; CHECK-NEXT:    bic r3, r3, #1
+; CHECK-NEXT:    subs r7, r3, #2
+; CHECK-NEXT:    movs r6, #1
+; CHECK-NEXT:    adr r4, .LCPI0_0
+; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
-; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
-; CHECK-NEXT:    add.w r8, r1, r2, lsl #2
-; CHECK-NEXT:    add.w r12, r0, r2, lsl #2
+; CHECK-NEXT:    add.w r10, r2, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT:    vldrw.u32 q0, [r4]
 ; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
 ; CHECK-NEXT:  .LBB0_4: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldrd r4, r2, [r0], #8
+; CHECK-NEXT:    ldrd r4, r3, [r0], #8
 ; CHECK-NEXT:    movs r5, #0
 ; CHECK-NEXT:    ldrd r7, r6, [r1], #8
-; CHECK-NEXT:    smull r4, r7, r7, r4
-; CHECK-NEXT:    asrl r4, r7, #31
+; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    smull r4, r11, r7, r4
+; CHECK-NEXT:    asrl r4, r11, #31
 ; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
 ; CHECK-NEXT:    mov.w r9, #-1
-; CHECK-NEXT:    sbcs.w r3, r9, r7
+; CHECK-NEXT:    sbcs.w r3, r9, r11
 ; CHECK-NEXT:    csetm r3, lt
 ; CHECK-NEXT:    bfi r5, r3, #0, #8
-; CHECK-NEXT:    smull r2, r3, r6, r2
-; CHECK-NEXT:    asrl r2, r3, #31
-; CHECK-NEXT:    rsbs.w r6, r2, #-2147483648
-; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
-; CHECK-NEXT:    sbcs.w r6, r9, r3
-; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
-; CHECK-NEXT:    csetm r6, lt
-; CHECK-NEXT:    bfi r5, r6, #8, #8
+; CHECK-NEXT:    ldr r3, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    smull r6, r3, r6, r3
+; CHECK-NEXT:    asrl r6, r3, #31
+; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
+; CHECK-NEXT:    vmov q2[2], q2[0], r4, r6
+; CHECK-NEXT:    sbcs.w r7, r9, r3
+; CHECK-NEXT:    vmov q2[3], q2[1], r11, r3
+; CHECK-NEXT:    csetm r7, lt
+; CHECK-NEXT:    mvn r6, #-2147483648
+; CHECK-NEXT:    bfi r5, r7, #8, #8
 ; CHECK-NEXT:    vmsr p0, r5
-; CHECK-NEXT:    mvn r5, #-2147483648
 ; CHECK-NEXT:    vpsel q2, q2, q0
-; CHECK-NEXT:    vmov r2, r3, d4
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    sbcs r2, r3, #0
-; CHECK-NEXT:    mov.w r3, #0
-; CHECK-NEXT:    csetm r2, lt
-; CHECK-NEXT:    bfi r3, r2, #0, #8
-; CHECK-NEXT:    vmov r2, r4, d5
-; CHECK-NEXT:    subs r2, r2, r5
-; CHECK-NEXT:    sbcs r2, r4, #0
-; CHECK-NEXT:    csetm r2, lt
-; CHECK-NEXT:    bfi r3, r2, #8, #8
-; CHECK-NEXT:    vmsr p0, r3
+; CHECK-NEXT:    vmov r3, r4, d4
+; CHECK-NEXT:    subs r3, r3, r6
+; CHECK-NEXT:    sbcs r3, r4, #0
+; CHECK-NEXT:    mov.w r4, #0
+; CHECK-NEXT:    csetm r3, lt
+; CHECK-NEXT:    bfi r4, r3, #0, #8
+; CHECK-NEXT:    vmov r3, r5, d5
+; CHECK-NEXT:    subs r3, r3, r6
+; CHECK-NEXT:    sbcs r3, r5, #0
+; CHECK-NEXT:    csetm r3, lt
+; CHECK-NEXT:    bfi r4, r3, #8, #8
+; CHECK-NEXT:    vmsr p0, r4
 ; CHECK-NEXT:    vpsel q2, q2, q1
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    strd r3, r2, [r11], #8
+; CHECK-NEXT:    vmov r3, s10
+; CHECK-NEXT:    vmov r4, s8
+; CHECK-NEXT:    strd r4, r3, [r2], #8
 ; CHECK-NEXT:    le lr, .LBB0_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-; CHECK-NEXT:    ldrd r2, r3, [sp] @ 8-byte Folded Reload
-; CHECK-NEXT:    cmp r2, r3
+; CHECK-NEXT:    ldrd r7, r3, [sp] @ 8-byte Folded Reload
+; CHECK-NEXT:    cmp r7, r3
 ; CHECK-NEXT:    beq .LBB0_8
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
-; CHECK-NEXT:    sub.w lr, r3, r2
+; CHECK-NEXT:    sub.w lr, r3, r7
 ; CHECK-NEXT:    mov.w r0, #-1
 ; CHECK-NEXT:    mov.w r1, #-2147483648
-; CHECK-NEXT:    mvn r3, #-2147483648
+; CHECK-NEXT:    mvn r2, #-2147483648
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr r2, [r12], #4
+; CHECK-NEXT:    ldr r3, [r12], #4
 ; CHECK-NEXT:    ldr r4, [r8], #4
-; CHECK-NEXT:    smull r2, r5, r4, r2
-; CHECK-NEXT:    asrl r2, r5, #31
-; CHECK-NEXT:    subs r4, r1, r2
-; CHECK-NEXT:    sbcs.w r4, r0, r5
-; CHECK-NEXT:    cset r4, lt
-; CHECK-NEXT:    cmp r4, #0
-; CHECK-NEXT:    csel r2, r2, r1, ne
-; CHECK-NEXT:    csel r4, r5, r0, ne
-; CHECK-NEXT:    subs r5, r2, r3
-; CHECK-NEXT:    sbcs r4, r4, #0
-; CHECK-NEXT:    csel r2, r2, r3, lt
-; CHECK-NEXT:    str r2, [r10], #4
+; CHECK-NEXT:    smull r4, r3, r4, r3
+; CHECK-NEXT:    asrl r4, r3, #31
+; CHECK-NEXT:    subs r5, r1, r4
+; CHECK-NEXT:    sbcs.w r5, r0, r3
+; CHECK-NEXT:    cset r5, lt
+; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    csel r4, r4, r1, ne
+; CHECK-NEXT:    csel r3, r3, r0, ne
+; CHECK-NEXT:    subs r5, r4, r2
+; CHECK-NEXT:    sbcs r3, r3, #0
+; CHECK-NEXT:    csel r3, r4, r2, lt
+; CHECK-NEXT:    str r3, [r10], #4
 ; CHECK-NEXT:    le lr, .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    add sp, #12
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.9:
diff --git a/llvm/test/CodeGen/VE/Scalar/br_jt.ll b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
index fd44a54a7273801..69cc1800555a76a 100644
--- a/llvm/test/CodeGen/VE/Scalar/br_jt.ll
+++ b/llvm/test/CodeGen/VE/Scalar/br_jt.ll
@@ -399,29 +399,29 @@ define signext i32 @br_jt4_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    and %s0, %s0, (32)0
 ; PIC-NEXT:    brlt.w 2, %s0, .LBB5_4
 ; PIC-NEXT:  # %bb.1:
-; PIC-NEXT:    breq.w 1, %s0, .LBB5_8
+; PIC-NEXT:    breq.w 1, %s0, .LBB5_7
 ; PIC-NEXT:  # %bb.2:
-; PIC-NEXT:    brne.w 2, %s0, .LBB5_7
+; PIC-NEXT:    brne.w 2, %s0, .LBB5_9
 ; PIC-NEXT:  # %bb.3:
 ; PIC-NEXT:    or %s0, 0, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB5_4:
-; PIC-NEXT:    breq.w 3, %s0, .LBB5_9
+; PIC-NEXT:    breq.w 3, %s0, .LBB5_8
 ; PIC-NEXT:  # %bb.5:
-; PIC-NEXT:    brne.w 4, %s0, .LBB5_7
+; PIC-NEXT:    brne.w 4, %s0, .LBB5_9
 ; PIC-NEXT:  # %bb.6:
 ; PIC-NEXT:    and %s0, %s1, (32)0
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s0
-; PIC-NEXT:  .LBB5_7:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB5_8:
+; PIC-NEXT:  .LBB5_7:
 ; PIC-NEXT:    or %s0, 3, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB5_9:
+; PIC-NEXT:  .LBB5_8:
 ; PIC-NEXT:    or %s0, 4, (0)1
+; PIC-NEXT:  .LBB5_9:
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %7 [
@@ -479,21 +479,21 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; CHECK-NEXT:    adds.w.sx %s0, 3, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_8:
-; CHECK-NEXT:    or %s0, 0, %s2
-; CHECK-NEXT:  .LBB6_9:
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_7:
-; CHECK-NEXT:    or %s0, 11, (0)1
+; CHECK-NEXT:  .LBB6_5:
+; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .LBB6_6:
 ; CHECK-NEXT:    or %s0, 10, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .LBB6_5:
-; CHECK-NEXT:    adds.w.sx %s0, -2, %s1
+; CHECK-NEXT:  .LBB6_7:
+; CHECK-NEXT:    or %s0, 11, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .LBB6_8:
+; CHECK-NEXT:    or %s0, 0, %s2
+; CHECK-NEXT:  .LBB6_9:
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
 ;
@@ -530,6 +530,10 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:    or %s0, 10, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
+; PIC-NEXT:  .LBB6_13:
+; PIC-NEXT:    or %s0, 0, (0)1
+; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
+; PIC-NEXT:    b.l.t (, %s10)
 ; PIC-NEXT:  .LBB6_14:
 ; PIC-NEXT:    adds.w.sx %s0, 3, %s1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -542,10 +546,6 @@ define signext i32 @br_jt7_m(i32 signext %0, i32 signext %1) {
 ; PIC-NEXT:  .LBB6_2:
 ; PIC-NEXT:    or %s0, 3, (0)1
 ; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
-; PIC-NEXT:    b.l.t (, %s10)
-; PIC-NEXT:  .LBB6_13:
-; PIC-NEXT:    or %s0, 0, (0)1
-; PIC-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; PIC-NEXT:    b.l.t (, %s10)
   switch i32 %0, label %11 [
     i32 1, label %12
diff --git a/llvm/test/CodeGen/VE/Scalar/brind.ll b/llvm/test/CodeGen/VE/Scalar/brind.ll
index 907f0a07504156a..6585e2b5eda664a 100644
--- a/llvm/test/CodeGen/VE/Scalar/brind.ll
+++ b/llvm/test/CodeGen/VE/Scalar/brind.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=ve | FileCheck %s
 
 ; Function Attrs: norecurse nounwind readnone
@@ -18,19 +19,19 @@ define signext i32 @brind(i32 signext %0) {
 ; CHECK-NEXT:    cmov.w.eq %s1, %s2, %s0
 ; CHECK-NEXT:    b.l.t (, %s1)
 ; CHECK-NEXT:  .Ltmp0: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_3:
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    or %s0, -1, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
-; CHECK-NEXT:  .Ltmp2: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_2:
-; CHECK-NEXT:    or %s0, 2, (0)1
-; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
-; CHECK-NEXT:    b.l.t (, %s10)
 ; CHECK-NEXT:  .Ltmp1: # Block address taken
-; CHECK-NEXT:  .LBB{{[0-9]+}}_1:
+; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    or %s0, 1, (0)1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
+; CHECK-NEXT:    b.l.t (, %s10)
+; CHECK-NEXT:  .Ltmp2: # Block address taken
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    or %s0, 2, (0)1
+; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %2 = icmp eq i32 %0, 1
   %3 = select i1 %2, ptr blockaddress(@brind, %6), ptr blockaddress(@brind, %8)
diff --git a/llvm/test/CodeGen/X86/bb_rotate.ll b/llvm/test/CodeGen/X86/bb_rotate.ll
index 55a7b0138026328..0ed0600e8dbad67 100644
--- a/llvm/test/CodeGen/X86/bb_rotate.ll
+++ b/llvm/test/CodeGen/X86/bb_rotate.ll
@@ -4,13 +4,13 @@ define i1 @no_viable_top_fallthrough() {
 ; CHECK-LABEL: no_viable_top_fallthrough
 ; CHECK: %.entry
 ; CHECK: %.bb1
+; CHECK: %.stop
 ; CHECK: %.bb2
 ; CHECK: %.middle
 ; CHECK: %.backedge
 ; CHECK: %.bb3
 ; CHECK: %.header
 ; CHECK: %.exit
-; CHECK: %.stop
 .entry:
   %val1 = call i1 @foo()
   br i1 %val1, label %.bb1, label %.header, !prof !10
diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
index cee8489e9aaea0c..bb081f6bab5329f 100644
--- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
+++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll
@@ -68,8 +68,8 @@ define void @func_large() !prof !0 {
 ; increased by ~17%
 ;
 ; CHECK-LABEL: Applying ext-tsp layout
-; CHECK:   original  layout score: 9171074274.27
-; CHECK:   optimized layout score: 10844307310.87
+; CHECK:   original  layout score: 23587612604815436.00
+; CHECK:   optimized layout score: 27891096739311172.00
 ; CHECK: b0
 ; CHECK: b2
 ; CHECK: b3
@@ -84,8 +84,8 @@ define void @func_large() !prof !0 {
 ; An expected output with chain-split-threshold=1 (disabling split point enumeration)
 ;
 ; CHECK2-LABEL: Applying ext-tsp layout
-; CHECK2:   original  layout score: 9171074274.27
-; CHECK2:   optimized layout score: 10844307310.87
+; CHECK2:   original  layout score: 23587612604815436.00
+; CHECK2:   optimized layout score: 27891096739311172.00
 ; CHECK2: b0
 ; CHECK2: b2
 ; CHECK2: b3
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d26f4b7044cf3c0..bf7c1c00c71df10 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -271,46 +271,47 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl $64, %edx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    subl %edx, %ebp
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $127, %ecx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebp, %ecx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    cmovnel %ebx, %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    cmovnel %esi, %edi
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %edx
+; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    jne .LBB4_1
 ; X86-NEXT:  # %bb.8: # %_udiv-special-cases
-; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    xorl $127, %ebp
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %ebp, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    je .LBB4_9
 ; X86-NEXT:  # %bb.5: # %udiv-bb1
@@ -326,9 +327,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    xorb $127, %al
 ; X86-NEXT:    movb %al, %ch
 ; X86-NEXT:    andb $7, %ch
@@ -353,33 +353,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %esi, %eax
 ; X86-NEXT:    shll %cl, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_1:
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_9
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -393,16 +389,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %cl
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %ebx
-; X86-NEXT:    movl 100(%esp,%ebx), %esi
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 100(%esp,%ebx), %ebp
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl 96(%esp,%ebx), %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %esi, %ebp
+; X86-NEXT:    shrdl %cl, %ebp, %edx
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl 88(%esp,%ebx), %esi
+; X86-NEXT:    movl 88(%esp,%ebx), %edx
 ; X86-NEXT:    movl 92(%esp,%ebx), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    shrl %cl, %eax
@@ -413,8 +409,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    shrdl %cl, %ebx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %ebx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -429,7 +425,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_3: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -440,22 +437,22 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shldl $1, %ebp, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %ebx, %ebp
-; X86-NEXT:    shldl $1, %esi, %ebx
+; X86-NEXT:    shldl $1, %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    shldl $1, %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edi, %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
+; X86-NEXT:    shldl $1, %esi, %edi
 ; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebp, %ecx
@@ -464,12 +461,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl $1, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
@@ -482,8 +478,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    sbbl %esi, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -496,26 +492,25 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    jne .LBB4_3
 ; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %edi
 ; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    shldl $1, %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    addl %esi, %esi
-; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:  .LBB4_9: # %udiv-end
 ; X86-NEXT:    xorl %ebx, %edi
 ; X86-NEXT:    xorl %ebx, %edx
@@ -528,11 +523,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %edx, 8(%ecx)
-; X86-NEXT:    movl %edi, 12(%ecx)
+; X86-NEXT:    movl %esi, (%ebp)
+; X86-NEXT:    movl %eax, 4(%ebp)
+; X86-NEXT:    movl %edx, 8(%ebp)
+; X86-NEXT:    movl %edi, 12(%ebp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edx, %ebx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index ebb95f16a723c4c..41f5d8590c237dc 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -177,14 +177,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $132, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    subl $136, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sete %bl
@@ -205,7 +205,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebp, %ebp
+; X86-NEXT:    bsrl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl $31, %ebp
@@ -262,28 +262,25 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovnel %ecx, %esi
 ; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmovnel %ecx, %ebp
 ; X86-NEXT:    jne .LBB4_8
 ; X86-NEXT:  # %bb.1: # %_udiv-special-cases
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    je .LBB4_8
 ; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -300,20 +297,20 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 124(%esp,%eax), %edx
-; X86-NEXT:    movl 128(%esp,%eax), %esi
+; X86-NEXT:    movl 128(%esp,%eax), %edx
+; X86-NEXT:    movl 132(%esp,%eax), %esi
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    shll %cl, %edx
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    movl 120(%esp,%eax), %ebp
+; X86-NEXT:    movl 124(%esp,%eax), %ebp
 ; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    shrl %esi
 ; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    orl %edx, %esi
 ; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl 116(%esp,%eax), %ebp
+; X86-NEXT:    movl 120(%esp,%eax), %ebp
 ; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shldl %cl, %ebp, %edx
 ; X86-NEXT:    shll %cl, %ebp
@@ -321,16 +318,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
-; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    jmp .LBB4_7
 ; X86-NEXT:  .LBB4_3: # %udiv-preheader
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -354,26 +352,29 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $15, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 80(%esp,%eax), %ebp
+; X86-NEXT:    movl 84(%esp,%eax), %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 80(%esp,%eax), %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 76(%esp,%eax), %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrdl %cl, %ebp, %ebx
-; X86-NEXT:    movl 68(%esp,%eax), %esi
-; X86-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    shrl %cl, %eax
+; X86-NEXT:    shrdl %cl, %ebx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%esp,%eax), %ebp
+; X86-NEXT:    movl 76(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrl %cl, %esi
 ; X86-NEXT:    notb %cl
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    shll %cl, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    shll %cl, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %ch, %cl
-; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrl %cl, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    shrdl %cl, %eax, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -386,41 +387,41 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    .p2align 4, 0x90
 ; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    shldl $1, %ebx, %ebp
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %ebx, %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %edx, %ebx
-; X86-NEXT:    shldl $1, %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shldl $1, %ebp, %edx
+; X86-NEXT:    shldl $1, %esi, %ebp
+; X86-NEXT:    shldl $1, %edi, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %esi
+; X86-NEXT:    shldl $1, %eax, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %eax
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, %ecx
-; X86-NEXT:    orl %edi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    shldl $1, %edi, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %edi
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    sarl $31, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl $1, %eax
@@ -433,93 +434,94 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    subl %ecx, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    sbbl %eax, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    jne .LBB4_4
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    shldl $1, %esi, %edx
-; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    shldl $1, %ebp, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ebp, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:  .LBB4_8: # %udiv-end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebp, (%ecx)
-; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %esi, 8(%ecx)
-; X86-NEXT:    movl %edx, 12(%ecx)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    imull %ecx, %edi
+; X86-NEXT:    addl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull %esi, %ebp
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %ebp
@@ -546,7 +548,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    addl $132, %esp
+; X86-NEXT:    addl $136, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fsafdo_test3.ll b/llvm/test/CodeGen/X86/fsafdo_test3.ll
index bbcc3ff59ec35fd..79b57fe4f1a3283 100644
--- a/llvm/test/CodeGen/X86/fsafdo_test3.ll
+++ b/llvm/test/CodeGen/X86/fsafdo_test3.ll
@@ -43,51 +43,51 @@
 ;; Check BFI before and after
 
 ; BFI: block-frequency-info: foo
-; BFI:  - BB0[entry]: float = 1.0, int = 8, count = 4268
-; BFI:  - BB1[for.cond1.preheader]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB2[if.then]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB3[if.end]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB4[if.then7]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB5[if.end9]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB6[if.then.1]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB7[if.end.1]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB8[if.then7.1]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB9[if.end9.1]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB10[if.then.2]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB11[if.end.2]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB12[if.then7.2]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB13[if.end9.2]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB14[if.then.3]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB15[if.end.3]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB16[if.then7.3]: float = 2.5405, int = 20, count = 10670
-; BFI:  - BB17[if.end9.3]: float = 59.967, int = 479, count = 255547
-; BFI:  - BB18[for.end12]: float = 1.0, int = 8, count = 4268
+; BFI:  - BB0[entry]: float = 1.0, int = {{.*}}, count = 4268
+; BFI:  - BB1[for.cond1.preheader]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB2[if.then]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB3[if.end]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB4[if.then7]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB5[if.end9]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB6[if.then.1]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB7[if.end.1]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB8[if.then7.1]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB9[if.end9.1]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB10[if.then.2]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB11[if.end.2]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB12[if.then7.2]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB13[if.end9.2]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB14[if.then.3]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB15[if.end.3]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB16[if.then7.3]: float = 2.5405, int = {{.*}}, count = 10843
+; BFI:  - BB17[if.end9.3]: float = 59.967, int = {{.*}}, count = 255941
+; BFI:  - BB18[for.end12]: float = 1.0, int = {{.*}}, count = 4268
 ;
 ; BFI: # *** IR Dump Before SampleFDO loader in MIR (fs-profile-loader) ***:
 ; BFI: # End machine code for function foo.
 ; BFI-EMPTY:
 ; BFI: block-frequency-info: foo
-; BFI:  - BB0[entry]: float = 1.0, int = 8, count = 4268
-; BFI:  - BB1[for.cond1.preheader]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB2[if.then]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB3[if.end]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB4[if.then7]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB5[if.end9]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB6[if.then.1]: float = 65.351, int = 522, count = 278487
-; BFI:  - BB7[if.end.1]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB8[if.then7.1]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB9[if.end9.1]: float = 66.446, int = 531, count = 283289
-; BFIV0:  - BB10[if.then.2]: float = 2.7041, int = 21, count = 11204
-; BFIV1:  - BB10[if.then.2]: float = 61.075, int = 488, count = 260348
-; BFI:  - BB11[if.end.2]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB12[if.then7.2]: float = 65.405, int = 523, count = 279021
-; BFI:  - BB13[if.end9.2]: float = 66.446, int = 531, count = 283289
-; BFIV0:  - BB14[if.then.3]: float = 61.075, int = 488, count = 260348
-; BFIV1:  - BB14[if.then.3]: float = 2.7041, int = 21, count = 11204
-; BFI:  - BB15[if.end.3]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB16[if.then7.3]: float = 54.846, int = 438, count = 233673
-; BFI:  - BB17[if.end9.3]: float = 66.446, int = 531, count = 283289
-; BFI:  - BB18[for.end12]: float = 1.0, int = 8, count = 4268
+; BFI:  - BB0[entry]: float = 1.0, int = {{.*}}, count = 4268
+; BFI:  - BB1[for.cond1.preheader]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB2[if.then]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB3[if.end]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB4[if.then7]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB5[if.end9]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB6[if.then.1]: float = 65.351, int = {{.*}}, count = 278916
+; BFI:  - BB7[if.end.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB8[if.then7.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB9[if.end9.1]: float = 66.446, int = {{.*}}, count = 283590
+; BFIV0:  - BB10[if.then.2]: float = 2.7041, int = {{.*}}, count = 11541
+; BFIV1:  - BB10[if.then.2]: float = 61.075, int = {{.*}}, count = 260670
+; BFI:  - BB11[if.end.2]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB12[if.then7.2]: float = 65.405, int = {{.*}}, count = 279149
+; BFI:  - BB13[if.end9.2]: float = 66.446, int = {{.*}}, count = 283590
+; BFIV0:  - BB14[if.then.3]: float = 61.075, int = {{.*}}, count = 260670
+; BFIV1:  - BB14[if.then.3]: float = 2.7041, int = {{.*}}, count = 11541
+; BFI:  - BB15[if.end.3]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB16[if.then7.3]: float = 54.846, int = {{.*}}, count = 234082
+; BFI:  - BB17[if.end9.3]: float = 66.446, int = {{.*}}, count = 283590
+; BFI:  - BB18[for.end12]: float = 1.0, int = {{.*}}, count = 4268
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/X86/pr38795.ll b/llvm/test/CodeGen/X86/pr38795.ll
index 8e0532e60652800..5695ab5e288b5d8 100644
--- a/llvm/test/CodeGen/X86/pr38795.ll
+++ b/llvm/test/CodeGen/X86/pr38795.ll
@@ -23,21 +23,22 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
 ; CHECK-NEXT:    .cfi_offset %ebp, -8
 ; CHECK-NEXT:    xorl %ebx, %ebx
-; CHECK-NEXT:    # implicit-def: $esi
+; CHECK-NEXT:    # implicit-def: $ecx
 ; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    # kill: killed $al
+; CHECK-NEXT:    # implicit-def: $al
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_14: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:  .LBB0_16: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:  .LBB0_1: # %for.cond
 ; CHECK-NEXT:    # =>This Loop Header: Depth=1
 ; CHECK-NEXT:    # Child Loop BB0_22 Depth 2
-; CHECK-NEXT:    cmpb $8, %dl
-; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    cmpb $8, %al
 ; CHECK-NEXT:    ja .LBB0_3
 ; CHECK-NEXT:  # %bb.2: # %for.cond
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -45,37 +46,36 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    je .LBB0_3
 ; CHECK-NEXT:  # %bb.4: # %if.end
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %esi, %ecx
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    cltd
 ; CHECK-NEXT:    idivl a
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    movb %cl, %dh
 ; CHECK-NEXT:    movl $0, h
-; CHECK-NEXT:    cmpb $8, %dl
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT:    cmpb $8, %al
 ; CHECK-NEXT:    jg .LBB0_8
 ; CHECK-NEXT:  # %bb.5: # %if.then13
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
 ; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Reload
+; CHECK-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %dl
 ; CHECK-NEXT:    je .LBB0_6
 ; CHECK-NEXT:    jmp .LBB0_18
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_3: # %if.then
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $.str, (%esp)
-; CHECK-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    calll printf
-; CHECK-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; CHECK-NEXT:    # implicit-def: $eax
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; CHECK-NEXT:  .LBB0_6: # %for.cond35
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testl %edi, %edi
@@ -96,31 +96,20 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    calll printf
 ; CHECK-NEXT:  .LBB0_21: # %for.end46
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    # implicit-def: $al
+; CHECK-NEXT:    # implicit-def: $dh
 ; CHECK-NEXT:    # implicit-def: $ebp
 ; CHECK-NEXT:    jmp .LBB0_22
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_8: # %if.end21
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    # implicit-def: $ebp
-; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    je .LBB0_13
+; CHECK-NEXT:    jmp .LBB0_9
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $eax
-; CHECK-NEXT:    testb %bl, %bl
-; CHECK-NEXT:    je .LBB0_19
-; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    # implicit-def: $edi
-; CHECK-NEXT:    # implicit-def: $ch
-; CHECK-NEXT:    # implicit-def: $dl
-; CHECK-NEXT:    # implicit-def: $ebp
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    jne .LBB0_11
 ; CHECK-NEXT:  .LBB0_7: # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    xorl %edi, %edi
-; CHECK-NEXT:    movb %dl, %cl
+; CHECK-NEXT:    movb %dl, %dh
+; CHECK-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_22: # %for.cond47
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
@@ -131,14 +120,14 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    # in Loop: Header=BB0_22 Depth=2
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne .LBB0_22
-; CHECK-NEXT:  # %bb.24: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %ch, %dl
+; CHECK-NEXT:  .LBB0_9: # %ae
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne .LBB0_10
-; CHECK-NEXT:  .LBB0_13: # %if.end26
+; CHECK-NEXT:  # %bb.13: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    je .LBB0_14
 ; CHECK-NEXT:  # %bb.15: # %if.end26
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
@@ -146,17 +135,31 @@ define dso_local void @fn() {
 ; CHECK-NEXT:    jne .LBB0_16
 ; CHECK-NEXT:  # %bb.17: # %if.then31
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    movb %dl, %ch
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; CHECK-NEXT:    xorl %ebp, %ebp
 ; CHECK-NEXT:  .LBB0_18: # %for.inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_16: # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movb %dl, %ch
-; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:  .LBB0_10: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $eax
+; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    je .LBB0_19
+; CHECK-NEXT:  .LBB0_12: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    # implicit-def: $edi
+; CHECK-NEXT:    # implicit-def: $cl
+; CHECK-NEXT:    # kill: killed $cl
+; CHECK-NEXT:    # implicit-def: $dl
+; CHECK-NEXT:    # implicit-def: $ebp
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jne .LBB0_11
+; CHECK-NEXT:    jmp .LBB0_7
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_14: # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; CHECK-NEXT:    movb %dh, %al
 ; CHECK-NEXT:    jmp .LBB0_1
 entry:
   br label %for.cond
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 7c8618af83d3d7b..308ae65928e4d10 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -267,6 +267,14 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-NEXT:    movl $2, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_2: # Block address taken
+; X64-NEXT:    # %bb1
+; X64-NEXT:    cmpq $.LBB4_2, %rdx
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    movl $7, %eax
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    retq
 ; X64-NEXT:  .LBB4_3: # Block address taken
 ; X64-NEXT:    # %bb2
 ; X64-NEXT:    cmpq $.LBB4_3, %rdx
@@ -283,14 +291,6 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-NEXT:    movl $42, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_2: # Block address taken
-; X64-NEXT:    # %bb1
-; X64-NEXT:    cmpq $.LBB4_2, %rdx
-; X64-NEXT:    cmovneq %rax, %rcx
-; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $7, %eax
-; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    retq
 ;
 ; X64-PIC-LABEL: test_indirectbr:
 ; X64-PIC:       # %bb.0: # %entry
@@ -309,6 +309,15 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-PIC-NEXT:    movl $2, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
+; X64-PIC-NEXT:  .LBB4_2: # Block address taken
+; X64-PIC-NEXT:    # %bb1
+; X64-PIC-NEXT:    leaq .LBB4_2(%rip), %rsi
+; X64-PIC-NEXT:    cmpq %rsi, %rdx
+; X64-PIC-NEXT:    cmovneq %rax, %rcx
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl $7, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .LBB4_3: # Block address taken
 ; X64-PIC-NEXT:    # %bb2
 ; X64-PIC-NEXT:    leaq .LBB4_3(%rip), %rsi
@@ -327,15 +336,6 @@ define dso_local i32 @test_indirectbr(ptr %ptr) nounwind {
 ; X64-PIC-NEXT:    movl $42, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
-; X64-PIC-NEXT:  .LBB4_2: # Block address taken
-; X64-PIC-NEXT:    # %bb1
-; X64-PIC-NEXT:    leaq .LBB4_2(%rip), %rsi
-; X64-PIC-NEXT:    cmpq %rsi, %rdx
-; X64-PIC-NEXT:    cmovneq %rax, %rcx
-; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
-; X64-PIC-NEXT:    orq %rcx, %rsp
-; X64-PIC-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirectbr:
 ; X64-RETPOLINE:       # %bb.0: # %entry
@@ -375,27 +375,27 @@ define dso_local i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp1: # Block address taken
-; X64-NEXT:  .LBB5_3: # %bb2
-; X64-NEXT:    cmpq $.LBB5_3, %rdx
+; X64-NEXT:  .LBB5_2: # %bb1
+; X64-NEXT:    cmpq $.LBB5_2, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $13, %eax
+; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp2: # Block address taken
-; X64-NEXT:  .LBB5_4: # %bb3
-; X64-NEXT:    cmpq $.LBB5_4, %rdx
+; X64-NEXT:  .LBB5_3: # %bb2
+; X64-NEXT:    cmpq $.LBB5_3, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $42, %eax
+; X64-NEXT:    movl $13, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ; X64-NEXT:  .Ltmp3: # Block address taken
-; X64-NEXT:  .LBB5_2: # %bb1
-; X64-NEXT:    cmpq $.LBB5_2, %rdx
+; X64-NEXT:  .LBB5_4: # %bb3
+; X64-NEXT:    cmpq $.LBB5_4, %rdx
 ; X64-NEXT:    cmovneq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rcx
-; X64-NEXT:    movl $7, %eax
+; X64-NEXT:    movl $42, %eax
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    retq
 ;
@@ -419,30 +419,30 @@ define dso_local i32 @test_indirectbr_global(i32 %idx) nounwind {
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp1: # Block address taken
-; X64-PIC-NEXT:  .LBB5_3: # %bb2
-; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_2: # %bb1
+; X64-PIC-NEXT:    leaq .LBB5_2(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $13, %eax
+; X64-PIC-NEXT:    movl $7, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp2: # Block address taken
-; X64-PIC-NEXT:  .LBB5_4: # %bb3
-; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_3: # %bb2
+; X64-PIC-NEXT:    leaq .LBB5_3(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $42, %eax
+; X64-PIC-NEXT:    movl $13, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ; X64-PIC-NEXT:  .Ltmp3: # Block address taken
-; X64-PIC-NEXT:  .LBB5_2: # %bb1
-; X64-PIC-NEXT:    leaq .LBB5_2(%rip), %rsi
+; X64-PIC-NEXT:  .LBB5_4: # %bb3
+; X64-PIC-NEXT:    leaq .LBB5_4(%rip), %rsi
 ; X64-PIC-NEXT:    cmpq %rsi, %rdx
 ; X64-PIC-NEXT:    cmovneq %rax, %rcx
 ; X64-PIC-NEXT:    shlq $47, %rcx
-; X64-PIC-NEXT:    movl $7, %eax
+; X64-PIC-NEXT:    movl $42, %eax
 ; X64-PIC-NEXT:    orq %rcx, %rsp
 ; X64-PIC-NEXT:    retq
 ;
@@ -658,12 +658,6 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:    movl $11, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
-; X64-RETPOLINE-NEXT:  .LBB7_6:
-; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
-; X64-RETPOLINE-NEXT:    shlq $47, %rcx
-; X64-RETPOLINE-NEXT:    movl $2, %eax
-; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    retq
 ; X64-RETPOLINE-NEXT:  .LBB7_7: # %bb1
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
@@ -676,6 +670,12 @@ define dso_local i32 @test_switch_jumptable(i32 %idx) nounwind {
 ; X64-RETPOLINE-NEXT:    movl $42, %eax
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
 ; X64-RETPOLINE-NEXT:    retq
+; X64-RETPOLINE-NEXT:  .LBB7_6:
+; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rcx
+; X64-RETPOLINE-NEXT:    movl $2, %eax
+; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
+; X64-RETPOLINE-NEXT:    retq
 entry:
   switch i32 %idx, label %bb0 [
     i32 0, label %bb1
@@ -839,13 +839,16 @@ define dso_local i32 @test_switch_jumptable_fallthrough(i32 %idx, ptr %a.ptr, pt
 ; X64-RETPOLINE-NEXT:  # %bb.13:
 ; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:    jmp .LBB8_12
+; X64-RETPOLINE-NEXT:  .LBB8_2:
+; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT:    jmp .LBB8_9
+; X64-RETPOLINE-NEXT:  .LBB8_6:
+; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
+; X64-RETPOLINE-NEXT:    jmp .LBB8_11
 ; X64-RETPOLINE-NEXT:  .LBB8_8:
 ; X64-RETPOLINE-NEXT:    cmoveq %r10, %r9
 ; X64-RETPOLINE-NEXT:    movl (%rsi), %edi
 ; X64-RETPOLINE-NEXT:    orl %r9d, %edi
-; X64-RETPOLINE-NEXT:    jmp .LBB8_9
-; X64-RETPOLINE-NEXT:  .LBB8_2:
-; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:  .LBB8_9: # %bb1
 ; X64-RETPOLINE-NEXT:    addl (%rdx), %edi
 ; X64-RETPOLINE-NEXT:    orl %r9d, %edi
@@ -853,9 +856,6 @@ define dso_local i32 @test_switch_jumptable_fallthrough(i32 %idx, ptr %a.ptr, pt
 ; X64-RETPOLINE-NEXT:  .LBB8_10: # %bb2
 ; X64-RETPOLINE-NEXT:    addl (%rcx), %eax
 ; X64-RETPOLINE-NEXT:    orl %r9d, %eax
-; X64-RETPOLINE-NEXT:    jmp .LBB8_11
-; X64-RETPOLINE-NEXT:  .LBB8_6:
-; X64-RETPOLINE-NEXT:    cmovneq %r10, %r9
 ; X64-RETPOLINE-NEXT:  .LBB8_11: # %bb3
 ; X64-RETPOLINE-NEXT:    addl (%r8), %eax
 ; X64-RETPOLINE-NEXT:    orl %r9d, %eax
diff --git a/llvm/test/CodeGen/X86/statepoint-ra.ll b/llvm/test/CodeGen/X86/statepoint-ra.ll
index 4e57648820c4b30..5a4e04dd70553a6 100644
--- a/llvm/test/CodeGen/X86/statepoint-ra.ll
+++ b/llvm/test/CodeGen/X86/statepoint-ra.ll
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;YAML:   - String:          ' total spills cost '
 ;YAML:   - NumReloads:      '7'
 ;YAML:   - String:          ' reloads '
-;YAML:   - TotalReloadsCost: '3.109004e-15'
+;YAML:   - TotalReloadsCost: '3.108624e-15'
 ;YAML:   - String:          ' total reloads cost '
 ;YAML:   - NumZeroCostFoldedReloads: '20'
 ;YAML:   - String:          ' zero cost folded reloads '
diff --git a/llvm/test/CodeGen/X86/switch.ll b/llvm/test/CodeGen/X86/switch.ll
index 264c5bf15aa28bb..57191eba8e6c460 100644
--- a/llvm/test/CodeGen/X86/switch.ll
+++ b/llvm/test/CodeGen/X86/switch.ll
@@ -1880,6 +1880,13 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.7: # %bb2
 ; CHECK-NEXT:    movl $2, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_12: # %entry
+; CHECK-NEXT:    cmpl $50, %edi
+; CHECK-NEXT:    je .LBB19_17
+; CHECK-NEXT:  # %bb.13: # %entry
+; CHECK-NEXT:    cmpl $60, %edi
+; CHECK-NEXT:    je .LBB19_14
+; CHECK-NEXT:    jmp .LBB19_18
 ; CHECK-NEXT:  .LBB19_8: # %entry
 ; CHECK-NEXT:    cmpl $30, %edi
 ; CHECK-NEXT:    je .LBB19_16
@@ -1889,20 +1896,14 @@ define void @left_leaning_weight_balanced_tree(i32 %x) {
 ; CHECK-NEXT:  # %bb.10: # %bb4
 ; CHECK-NEXT:    movl $4, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB19_12: # %entry
-; CHECK-NEXT:    cmpl $50, %edi
-; CHECK-NEXT:    je .LBB19_17
-; CHECK-NEXT:  # %bb.13: # %entry
-; CHECK-NEXT:    cmpl $60, %edi
-; CHECK-NEXT:    je .LBB19_14
-; CHECK-NEXT:  .LBB19_18: # %return
-; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_15: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB19_16: # %bb3
 ; CHECK-NEXT:    movl $3, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB19_18: # %return
+; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB19_17: # %bb5
 ; CHECK-NEXT:    movl $5, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
@@ -2351,12 +2352,12 @@ define void @jump_table_affects_balance(i32 %x) {
 ; CHECK-NEXT:  .LBB22_6: # %bb0
 ; CHECK-NEXT:    xorl %edi, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
-; CHECK-NEXT:  .LBB22_8: # %bb2
-; CHECK-NEXT:    movl $2, %edi
-; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB22_7: # %bb1
 ; CHECK-NEXT:    movl $1, %edi
 ; CHECK-NEXT:    jmp g at PLT # TAILCALL
+; CHECK-NEXT:  .LBB22_8: # %bb2
+; CHECK-NEXT:    movl $2, %edi
+; CHECK-NEXT:    jmp g at PLT # TAILCALL
 ; CHECK-NEXT:  .LBB22_10: # %return
 ; CHECK-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll b/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
index 6fa6f94e6530a97..1b8bf8eea5df25e 100644
--- a/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
+++ b/llvm/test/CodeGen/X86/tail-dup-no-other-successor.ll
@@ -12,10 +12,10 @@ declare void @effect(i32);
 ; CHECK: %entry
 ; CHECK: %loop.top
 ; CHECK: %loop.latch
-; CHECK: %top.fakephi
 ; CHECK: %loop.end
 ; CHECK: %false
 ; CHECK: %ret
+; CHECK: %top.fakephi
 define void @no_successor_still_no_taildup (i32 %count, i32 %key) {
 entry:
   br label %loop.top
diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll
index ae3401ece7ce114..d54110d1fa8119a 100644
--- a/llvm/test/CodeGen/X86/tail-opts.ll
+++ b/llvm/test/CodeGen/X86/tail-opts.ll
@@ -279,11 +279,7 @@ define fastcc void @c_expand_expr_stmt(ptr %expr) nounwind {
 ; CHECK-NEXT:  .LBB3_9: # %bb3
 ; CHECK-NEXT:  .LBB3_15:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:  .LBB3_16: # %lvalue_p.exit4
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne .LBB3_9
-; CHECK-NEXT:  # %bb.17: # %lvalue_p.exit4
-; CHECK-NEXT:    testb %bl, %bl
+; CHECK-NEXT:    jmp .LBB3_16
 ; CHECK-NEXT:  .LBB3_10: # %bb2.i3
 ; CHECK-NEXT:    movq 8(%rax), %rax
 ; CHECK-NEXT:    movzbl 16(%rax), %ecx
@@ -302,8 +298,12 @@ define fastcc void @c_expand_expr_stmt(ptr %expr) nounwind {
 ; CHECK-NEXT:    je .LBB3_16
 ; CHECK-NEXT:  # %bb.14: # %bb2.i.i2
 ; CHECK-NEXT:    cmpl $23, %ecx
-; CHECK-NEXT:    je .LBB3_16
-; CHECK-NEXT:    jmp .LBB3_9
+; CHECK-NEXT:    jne .LBB3_9
+; CHECK-NEXT:  .LBB3_16: # %lvalue_p.exit4
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB3_9
+; CHECK-NEXT:  # %bb.17: # %lvalue_p.exit4
+; CHECK-NEXT:    testb %bl, %bl
 entry:
   %tmp4 = load i8, ptr null, align 8                  ; <i8> [#uses=3]
   switch i8 %tmp4, label %bb3 [
diff --git a/llvm/test/Other/cfg-printer-branch-weights.ll b/llvm/test/Other/cfg-printer-branch-weights.ll
index c8d57ecbbc2b223..803087f3318e969 100644
--- a/llvm/test/Other/cfg-printer-branch-weights.ll
+++ b/llvm/test/Other/cfg-printer-branch-weights.ll
@@ -6,11 +6,11 @@ entry:
   %check = icmp sgt i32 %0, 0
   br i1 %check, label %if, label %exit, !prof !0
 
-; CHECK: label="W:7"
+; CHECK: label="W:89623871094784"
 ; CHECK-NOT: ["];
 if:                     ; preds = %entry
   br label %exit
-; CHECK: label="W:1600"
+; CHECK: label="W:17924774638387200"
 ; CHECK-NOT: ["];
 exit:                   ; preds = %entry, %if
   ret void
diff --git a/llvm/test/ThinLTO/X86/function_entry_count.ll b/llvm/test/ThinLTO/X86/function_entry_count.ll
index 12cedba6b9c83dd..b65bc226040bfcb 100644
--- a/llvm/test/ThinLTO/X86/function_entry_count.ll
+++ b/llvm/test/ThinLTO/X86/function_entry_count.ll
@@ -18,7 +18,7 @@
 ; CHECK: define void @f(i32{{.*}}) [[ATTR:#[0-9]+]] !prof ![[PROF1:[0-9]+]]
 ; CHECK: define available_externally void @g() !prof ![[PROF2]]
 ; CHECK-DAG: ![[PROF1]] = !{!"synthetic_function_entry_count", i64 10}
-; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 198}
+; CHECK-DAG: ![[PROF2]] = !{!"synthetic_function_entry_count", i64 200}
 ; CHECK-DAG: attributes [[ATTR]] = { norecurse nounwind }
 
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
index 63568456d0e58c8..ca50a04a328151c 100644
--- a/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ b/llvm/test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -31,4 +31,4 @@ ret i32 %val
 !2 = !{!"branch_weights", i32 5, i32 5}
 !3 = !{!"branch_weights", i32 4, i32 1}
 
-; CHECK: [[COUNT1]] = !{!"branch_weights", i32 31, i32 8}
+; CHECK: [[COUNT1]] = !{!"branch_weights", i32 858993459, i32 214748365}
diff --git a/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll b/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
index 88ba4d3562c826c..e4352e4d98b77e6 100644
--- a/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
+++ b/llvm/test/Transforms/ConstantHoisting/X86/pr52689-not-all-uses-rebased.ll
@@ -2,6 +2,10 @@
 
 ; REQUIRES: asserts
 
+; My changes fixed this likely by accident, please update as necessary when
+; you work on this:
+; XFAIL: *
+
 ; Matching assertion strings is not easy as they might differ on different
 ; platforms. So limit this to x86_64-linux.
 ; REQUIRES: x86_64-linux
diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
index f11bfd026688192..8c9d89871d00b32 100644
--- a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll
@@ -52,5 +52,5 @@ bb_join:
 ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
 ; CHECK: [[PROF1]] = !{!"branch_weights", i32 400, i32 600}
 ; CHECK: [[PROF2]] = !{!"branch_weights", i32 300, i32 300}
-; CHECK: [[PROF3]] = !{!"branch_weights", i32 678152731, i32 1469330917}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 613566756, i32 1533916892}
 ;.
diff --git a/llvm/test/Transforms/JumpThreading/update-edge-weight.ll b/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
index ff82fb0b214d401..6313a87993303fd 100644
--- a/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
+++ b/llvm/test/Transforms/JumpThreading/update-edge-weight.ll
@@ -2,7 +2,7 @@
 
 ; Test if edge weights are properly updated after jump threading.
 
-; CHECK: !2 = !{!"branch_weights", i32 1629125526, i32 518358122}
+; CHECK: !2 = !{!"branch_weights", i32 1561806291, i32 585677357}
 
 define void @foo(i32 %n) !prof !0 {
 entry:
diff --git a/llvm/test/Transforms/LICM/loopsink.ll b/llvm/test/Transforms/LICM/loopsink.ll
index c08b992f35f41b9..ea7b0e06264d711 100644
--- a/llvm/test/Transforms/LICM/loopsink.ll
+++ b/llvm/test/Transforms/LICM/loopsink.ll
@@ -195,23 +195,27 @@ define i32 @t3(i32, i32) #0 !prof !0 {
   ret i32 10
 }
 
-; For single-BB loop with <=1 avg trip count, sink load to b1
+; For single-BB loop with <=1 avg trip count, sink load to body
 ; CHECK: t4
-; CHECK: .preheader:
+; CHECK: .header:
 ; CHECK-NOT: load i32, ptr @g
-; CHECK: .b1:
+; CHECK: .body:
 ; CHECK: load i32, ptr @g
 ; CHECK: .exit:
 define i32 @t4(i32, i32) #0 !prof !0 {
-.preheader:
+.entry:
   %invariant = load i32, ptr @g
-  br label %.b1
+  br label %.header
 
-.b1:
-  %iv = phi i32 [ %t1, %.b1 ], [ 0, %.preheader ]
+.header:
+  %iv = phi i32 [ %t1, %.body ], [ 0, %.entry ]
+  %c0 = icmp sgt i32 %iv, %0
+  br i1 %c0, label %.body, label %.exit, !prof !1
+
+.body:
   %t1 = add nsw i32 %invariant, %iv
   %c1 = icmp sgt i32 %iv, %0
-  br i1 %c1, label %.b1, label %.exit, !prof !1
+  br label %.header
 
 .exit:
   ret i32 10
diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
index 174d55651171c11..2dc515758afebb6 100644
--- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDataPrefetch/AArch64/opt-remark-with-hotness.ll
@@ -78,5 +78,5 @@ for.body:                                         ; preds = %for.body, %for.body
 !19 = !{!"int", !13, i64 0}
 !20 = !DILocation(line: 9, column: 11, scope: !6)
 !21 = !{!"function_entry_count", i64 6}
-!22 = !{!"branch_weights", i32 99, i32 1}
+!22 = !{!"branch_weights", i32 2000, i32 1}
 !23 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
index 0b31fd8d45e8380..6f36f4d263f4301 100644
--- a/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
+++ b/llvm/test/Transforms/LoopDistribute/diagnostics-with-hotness.ll
@@ -79,5 +79,5 @@ for.cond.cleanup:
 !20 = distinct !{!20, !21}
 !21 = !{!"llvm.loop.distribute.enable", i1 true}
 !22 = !{!"function_entry_count", i64 3}
-!23 = !{!"branch_weights", i32 99, i32 1}
+!23 = !{!"branch_weights", i32 2000, i32 1}
 !24 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
index f587ed99ab84daa..5d742b64e0adbf4 100644
--- a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
+++ b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
@@ -70,9 +70,9 @@ outer_loop_exit:
 
 ; BFI_AFTER-LABEL: block-frequency-info: func1
 ; BFI_AFTER: - entry: {{.*}} count = 1024
-; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
-; BFI_AFTER: - loop_body: {{.*}} count = 20608
-; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
+; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1016
+; BFI_AFTER: - loop_body: {{.*}} count = 20480
+; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1016
 ; BFI_AFTER: - loop_exit: {{.*}} count = 1024
 
 ; IR-LABEL: define void @func1
@@ -146,14 +146,14 @@ loop_exit:
 
 ; BFI_BEFORE-LABEL: block-frequency-info: func3_zero_branch_weight
 ; BFI_BEFORE: - entry: {{.*}} count = 1024
-; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255296
-; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254272
+; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255552
+; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254528
 ; BFI_BEFORE: - loop_exit: {{.*}} count = 1024
 
 ; BFI_AFTER-LABEL: block-frequency-info: func3_zero_branch_weight
 ; BFI_AFTER: - entry: {{.*}} count = 1024
 ; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
-; BFI_AFTER: - loop_body: {{.*}} count = 2199023255296
+; BFI_AFTER: - loop_body: {{.*}} count = 2199023255552
 ; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
 ; BFI_AFTER: - loop_exit: {{.*}} count = 1024
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
index 44aae477bf71c15..33d1d3f0d22191d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -7,11 +7,12 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; Verify that we generate 512-bit wide vectors for a basic integer memset
 ; loop.
 
-; CHECK-LABEL: f:
-; CHECK: vmovdqu64 %zmm{{.}},
-; CHECK-NOT: %ymm
-; CHECK: epilog
+; CHECK-LABEL: _f:
+; CHECK: %vec.epilog.vector.body
 ; CHECK: %ymm
+; CHECK: %vector.body
+; CHECK-NOT: %ymm
+; CHECK: vmovdqu64 %zmm{{.}},
 
 ; Verify that we don't generate 512-bit wide vectors when subtarget feature says not to
 
@@ -46,7 +47,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit
 ; vectors
 
-; CHECK-LABEL: g:
+; CHECK-LABEL: _g:
 ; CHECK: vmovdqu %ymm{{.}},
 ; CHECK-NOT: %zmm
 
@@ -81,17 +82,19 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; Verify that the "prefer-vector-width=512" attribute override the subtarget
 ; vectors
 
-; CHECK-LABEL: h:
+; CHECK-LABEL: _h:
+; CHECK: %vec.epilog.vector.body
+; CHECK: %ymm
+; CHECK: %vector.body
 ; CHECK: vmovdqu64 %zmm{{.}},
 ; CHECK-NOT: %ymm
-; CHECK: epilog
-; CHECK: %ymm
 
 ; CHECK-PREFER-AVX256-LABEL: h:
+; CHECK-PREFER-AVX256: %vec.epilog.vector.body
+; CHECK-PREFER-AVX256: %ymm
+; CHECK-PREFER-AVX256: %vector.body
 ; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
 ; CHECK-PREFER-AVX256-NOT: %ymm
-; CHECK-PREFER-AVX256: epilog
-; CHECK-PREFER-AVX256: %ymm
 
 define void @h(ptr %a, i32 %n) "prefer-vector-width"="512" {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
index b1fc96ea77ed034..4f413a50837dd69 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -108,5 +108,5 @@ attributes #0 = { nounwind }
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
 !29 = !{!"function_entry_count", i64 3}
-!30 = !{!"branch_weights", i32 99, i32 1}
+!30 = !{!"branch_weights", i32 10000, i32 1}
 !31 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
index ed107b10dcd9874..4da1d099645bee2 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -198,5 +198,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
index 30d11a12c79c4bc..4b7b714a2562800 100644
--- a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -209,5 +209,5 @@ attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "l
 !55 = distinct !{!55, !43}
 !56 = !{!"function_entry_count", i64 3}
 !57 = !{!"function_entry_count", i64 50}
-!58 = !{!"branch_weights", i32 99, i32 1}
+!58 = !{!"branch_weights", i32 10000, i32 1}
 !59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext b/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
index c6cb02aaddd1d6d..651ca44caf808d0 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/PR41279_2.proftext
@@ -1,7 +1,8 @@
 :ir
 f
 1096621589180411894
-2
+3
 3
 2
+1
 
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
index 85b9779abeece66..6757a1ad6185e0b 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge.proftext
@@ -7,10 +7,10 @@ test_criticalEdge
 1
 2
 2
-0
-1
 2
 1
+0
+1
 
 <stdin>:bar
 742261418966908927
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
index f1497d6c01c9f89..3cc0bb0be65bf26 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/criticaledge_entry.proftext
@@ -8,10 +8,10 @@ test_criticalEdge
 2
 1
 2
-0
-1
 2
 1
+0
+1
 
 <stdin>:bar
 742261418966908927
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
index 49fafd9d99bf91f..0cbdea7aacb6144 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr.proftext
@@ -7,6 +7,6 @@ foo
 4
 # Counter Values:
 139
-20
 5
+20
 63
diff --git a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
index 6910f7e21d677e7..70d2844ba5ade02 100644
--- a/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
+++ b/llvm/test/Transforms/PGOProfile/Inputs/indirectbr_entry.proftext
@@ -8,6 +8,6 @@ foo
 4
 # Counter Values:
 202
-20
 5
+20
 63
diff --git a/llvm/test/Transforms/PGOProfile/PR41279_2.ll b/llvm/test/Transforms/PGOProfile/PR41279_2.ll
index fc3e54fcb4c17a3..8c3c5695c1a5d6a 100644
--- a/llvm/test/Transforms/PGOProfile/PR41279_2.ll
+++ b/llvm/test/Transforms/PGOProfile/PR41279_2.ll
@@ -9,7 +9,21 @@ define dso_local void @f() personality ptr @__C_specific_handler {
 ; USE-SAME: !prof ![[FUNC_ENTRY_COUNT:[0-9]+]]
 ; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
 ; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
-; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 5}
+; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 6}
+;
+; GEN-LABEL: @f
+;
+; GEN: catch.dispatch:
+; GEN-NOT: call void @llvm.instrprof.increment
+;
+; GEN:  _except1:
+; GEN:    call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 3, i32 1)
+;
+; GEN: __except6:
+; GEN:   call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 3, i32 2)
+;
+; GEN: invoke.cont3:
+; GEN:   call void @llvm.instrprof.increment(ptr @__profn_f, i64 1096621589180411894, i32 3, i32 0)
 entry:
   %__exception_code = alloca i32, align 4
   %__exception_code2 = alloca i32, align 4
@@ -27,8 +41,6 @@ __except1:
   %2 = call i32 @llvm.eh.exceptioncode(token %1)
   store i32 %2, ptr %__exception_code, align 4
   br label %__try.cont7
-;GEN:  _except1:
-;GEN:    call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 2, i32 1)
 
 invoke.cont:
   br label %__try.cont
@@ -39,8 +51,6 @@ __try.cont:
 
 catch.dispatch4:
   %3 = catchswitch within none [label %__except5] unwind to caller
-; GEN: catch.dispatch4:
-; GEN-NOT: call void @llvm.instrprof.increment
 
 __except5:
   %4 = catchpad within %3 [ptr null]
@@ -56,9 +66,6 @@ __try.cont7:
 
 invoke.cont3:
   br label %__try.cont7
-;GEN: invoke.cont3:
-;GEN:  call void @llvm.instrprof.increment(ptr @__profn_f, i64 {{[0-9]+}}, i32 2, i32 0)
-
 }
 
 declare dso_local i32 @__C_specific_handler(...)
diff --git a/llvm/test/Transforms/PGOProfile/bfi_verification.ll b/llvm/test/Transforms/PGOProfile/bfi_verification.ll
index 9d07842a3122177..395a69b3535a3c0 100644
--- a/llvm/test/Transforms/PGOProfile/bfi_verification.ll
+++ b/llvm/test/Transforms/PGOProfile/bfi_verification.ll
@@ -95,15 +95,8 @@ if.then25:
 if.end26:
   ret void
 }
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB do.body Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.cond Count=80655628 BFI_Count=83956530
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.body Count=41017879 BFI_Count=42370585
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.cond3 Count=71254487 BFI_Count=73756204
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.body7 Count=31616738 BFI_Count=32954900
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB while.end8 Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.then Count=32743703 BFI_Count=33739540
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.end Count=39637749 BFI_Count=40801304
-; THRESHOLD-CHECK: remark: <unknown>:0:0: BB if.then25 Count=6013544 BFI_Count=6277124
-; THRESHOLD-CHECK: remark: <unknown>:0:0: In Func sort_basket: Num_of_BB=14, Num_of_non_zerovalue_BB=14, Num_of_mis_matching_BB=9
-; HOTONLY-CHECK: remark: <unknown>:0:0: BB if.then25 Count=6013544 BFI_Count=6277124 (raw-Cold to BFI-Hot)
-; HOTONLY-CHECK: remark: <unknown>:0:0: In Func sort_basket: Num_of_BB=14, Num_of_non_zerovalue_BB=14, Num_of_mis_matching_BB=1
+
+; TODO: All the inaccuracies that were reported here disappeared after
+; improving BFI precision...
+; THRESHOLD-CHECK:{{.*}}
+; HOTONLY-CHECK:{{.*}}
diff --git a/llvm/test/Transforms/PGOProfile/criticaledge.ll b/llvm/test/Transforms/PGOProfile/criticaledge.ll
index c24925c68fa32db..388ba6f353b3603 100644
--- a/llvm/test/Transforms/PGOProfile/criticaledge.ll
+++ b/llvm/test/Transforms/PGOProfile/criticaledge.ll
@@ -48,7 +48,7 @@ sw.bb:
 
 sw.bb1:
 ; GEN: sw.bb1:
-; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 4)
+; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 6)
   %call2 = call i32 @bar(i32 1024)
   br label %sw.epilog
 
@@ -75,7 +75,7 @@ if.end:
 
 sw.default:
 ; GEN: sw.default:
-; GEN-NOT: call void @llvm.instrprof.increment
+; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 4)
   %call6 = call i32 @bar(i32 32)
   %cmp7 = icmp sgt i32 %j, 10
   br i1 %cmp7, label %if.then8, label %if.end9
@@ -90,7 +90,7 @@ if.then8:
 
 if.end9:
 ; GEN: if.end9:
-; GEN: call void @llvm.instrprof.increment(ptr @__profn_test_criticalEdge, i64 {{[0-9]+}}, i32 8, i32 6)
+; GEN-NOT: call void @llvm.instrprof.increment
   %res.0 = phi i32 [ %add, %if.then8 ], [ %call6, %sw.default ]
   br label %sw.epilog
 
diff --git a/llvm/test/Transforms/PGOProfile/fix_bfi.ll b/llvm/test/Transforms/PGOProfile/fix_bfi.ll
index fcfe3aa7b3a9cc1..aedef436210ef07 100644
--- a/llvm/test/Transforms/PGOProfile/fix_bfi.ll
+++ b/llvm/test/Transforms/PGOProfile/fix_bfi.ll
@@ -96,4 +96,4 @@ if.end26:
 }
 
 ; CHECK: define dso_local void @sort_basket(i64 %min, i64 %max) #0 !prof [[ENTRY_COUNT:![0-9]+]]
-; CHECK: [[ENTRY_COUNT]] = !{!"function_entry_count", i64 12949310}
+; CHECK: [[ENTRY_COUNT]] = !{!"function_entry_count", i64 13338888}
diff --git a/llvm/test/Transforms/PGOProfile/loop2.ll b/llvm/test/Transforms/PGOProfile/loop2.ll
index 071f8a6d5ad5949..c872c618a64be66 100644
--- a/llvm/test/Transforms/PGOProfile/loop2.ll
+++ b/llvm/test/Transforms/PGOProfile/loop2.ll
@@ -30,7 +30,8 @@ for.cond.outer:
 
 for.body.outer:
 ; GEN: for.body.outer:
-; GEN-NOT: call void @llvm.instrprof.increment
+; NOTENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 798733566382720768, i32 3, i32 1)
+; ENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 798733566382720768, i32 3, i32 2)
   br label %for.cond.inner
 
 for.cond.inner:
@@ -62,8 +63,7 @@ for.end.inner:
 
 for.inc.outer:
 ; GEN: for.inc.outer:
-; NOTENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 {{[0-9]+}}, i32 3, i32 1)
-; ENTRY: call void @llvm.instrprof.increment(ptr @__profn_test_nested_for, i64 {{[0-9]+}}, i32 3, i32 2)
+; GEN-NOT: call void @llvm.instrprof.increment
   %inc.2 = add nsw i32 %i.0, 1
   br label %for.cond.outer
 
diff --git a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
index f5c3ca4aca470df..ef2fcc6a9e2485a 100644
--- a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll
@@ -58,19 +58,19 @@ b1:
 b2:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 2, i32 0, i64 -1)
   br i1 %cmp, label %b7, label %b3
-; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 625
+; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 586
 
 b3:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 3, i32 0, i64 -1)
   br i1 %cmp, label %b7, label %b4
-; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 625
+; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 586
 ; CHECK2: br i1 %cmp, label %b7, label %b4,
 ; CHECK2-SAME: !prof ![[END172_PROF:[0-9]+]]
 
 b4:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 4, i32 0, i64 -1)
   br label %b2
-; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 624
+; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 585
 
 b5:
   call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 5, i32 0, i64 -1)
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
index 36772eda1ede76e..9d38f8889396a6e 100644
--- a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
+++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll
@@ -148,26 +148,26 @@ b1:
   br i1 %cmp, label %b2, label %b3
 ; CHECK:  edge b1 -> b2 probability is 0x40000000 / 0x80000000 = 50.00%
 ; CHECK:  edge b1 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
-; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 1973
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 2000
 
 b2:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 3, i32 0, i64 -1)
   br i1 %cmp, label %b3, label %b4
 ; CHECK:  edge b2 -> b3 probability is 0x40000000 / 0x80000000 = 50.00%
 ; CHECK:  edge b2 -> b4 probability is 0x40000000 / 0x80000000 = 50.00%
-; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 955
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 1000
 
 b3:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 4, i32 0, i64 -1)
   br label %b5
 ; CHECK:  edge b3 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
-; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1527
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1500
 
 b4:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 5, i32 0, i64 -1)
   br label %b5
 ; CHECK:  edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
-; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 445
+; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 500
 
 b5:
   call void @llvm.pseudoprobe(i64 2506109673213838996, i64 6, i32 0, i64 -1)
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
index 19e83649723d642..105494942d383d5 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-update-2.ll
@@ -14,8 +14,8 @@ T1:                                               ; preds = %0
   %v1 = call i32 @f1(), !prof !12
   %cond3 = icmp eq i32 %v1, 412
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1)
-;; The distribution factor -8513881372706734080 stands for 53.85%, whic is from 7/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -8513881372706734080)
+;; The distribution factor -9223372036854775808 stands for 53.85%, whic is from 7/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID:]], i64 4, i32 0, i64 -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !13
 ;; Probe 7 has two copies, since they don't share the same inline context, they are not
 ;; considered sharing samples, thus their distribution factors are not fixed up.
@@ -29,8 +29,8 @@ T1:                                               ; preds = %0
 Merge:                                            ; preds = %0
   %v2 = call i32 @f2(), !prof !12
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1)
-;; The distribution factor 8513881922462547968 stands for 46.25%, which is from 6/6+7.
-; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64 8513881922462547968)
+;; The distribution factor  -9223372036854775808 stands for 46.25%, which is from 6/6+7.
+; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 4, i32 0, i64  -9223372036854775808)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 8513881922462547968), !dbg !13
 ; CHECK: call void @llvm.pseudoprobe(i64 [[#GUID]], i64 7, i32 0, i64 -1)
   call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !18
@@ -77,4 +77,4 @@ attributes #0 = { inaccessiblememonly nounwind willreturn }
 !16 = distinct !DILocation(line: 10, column: 11, scope: !17)
 !17 = !DILexicalBlockFile(scope: !4, file: !5, discriminator: 186646551)
 !18 = !DILocation(line: 53, column: 3, scope: !15, inlinedAt: !19)
-!19 = !DILocation(line: 12, column: 3, scope: !4)
\ No newline at end of file
+!19 = !DILocation(line: 12, column: 3, scope: !4)



More information about the llvm-commits mailing list