[llvm] EarlyCSE: create casts on type-mismatch (PR #113339)

Ramkumar Ramachandra via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 30 07:12:10 PDT 2024


https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/113339

>From 7800b0b3e42026d2cc00fd33d0bbb16abd9d6232 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Tue, 22 Oct 2024 16:44:13 +0100
Subject: [PATCH 1/3] CodeGen/test: regen two tests with UTC (NFC)

---
 llvm/test/CodeGen/NVPTX/load-store.ll         | 2145 ++++++++++++-----
 .../PowerPC/big-endian-store-forward.ll       |   12 +-
 2 files changed, 1586 insertions(+), 571 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index f922fd92fa244e..8435e016096621 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
@@ -22,149 +23,297 @@
 
 ; generic statespace
 
-; CHECK-LABEL: generic_weak
 define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
-  ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_weak(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_weak_param_0];
+; CHECK-NEXT:    ld.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [generic_weak_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [generic_weak_param_2];
+; CHECK-NEXT:    st.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [generic_weak_param_3];
+; CHECK-NEXT:    ld.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.f64 %fd1, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.f64 [%rd4], %fd2;
+; CHECK-NEXT:    ld.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load i8, ptr %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i8 %a.add, ptr %a
 
-  ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load i16, ptr %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i16 %b.add, ptr %b
 
-  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load i32, ptr %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store i32 %c.add, ptr %c
 
-  ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load i64, ptr %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store i64 %d.add, ptr %d
 
-  ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load float, ptr %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store float %e.add, ptr %c
 
-  ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load double, ptr %d
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store double %f.add, ptr %d
 
   ; TODO: make the lowering of this weak vector ops consistent with
   ;       the ones of the next tests. This test lowers to a weak PTX
   ;       vector op, but next test lowers to a vector PTX op.
-  ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load <2 x i8>, ptr %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <2 x i8> %h.add, ptr %b
 
   ; TODO: make the lowering of this weak vector ops consistent with
   ;       the ones of the previous test. This test lowers to a weak
   ;       PTX scalar op, but prior test lowers to a vector PTX op.
-  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load <4 x i8>, ptr %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <4 x i8> %i.add, ptr %c
 
-  ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load <2 x i16>, ptr %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <2 x i16> %j.add, ptr %c
 
-  ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load <4 x i16>, ptr %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <4 x i16> %k.add, ptr %d
 
-  ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load <2 x i32>, ptr %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store <2 x i32> %l.add, ptr %d
 
-  ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load <4 x i32>, ptr %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store <4 x i32> %m.add, ptr %d
 
-  ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load <2 x i64>, ptr %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store <2 x i64> %n.add, ptr %d
 
-  ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load <2 x float>, ptr %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store <2 x float> %o.add, ptr %d
 
-  ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load <4 x float>, ptr %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store <4 x float> %p.add, ptr %d
 
-  ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load <2 x double>, ptr %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store <2 x double> %q.add, ptr %d
 
   ret void
 }
 
-; CHECK-LABEL: generic_volatile
 define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_volatile(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_param_0];
+; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [generic_volatile_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [generic_volatile_param_2];
+; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [generic_volatile_param_3];
+; CHECK-NEXT:    ld.volatile.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.volatile.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.volatile.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.volatile.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.volatile.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.volatile.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.volatile.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.volatile.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.volatile.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.volatile.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.volatile.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.volatile.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.volatile.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.volatile.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.volatile.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.volatile.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i8 %a.add, ptr %a
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load volatile i16, ptr %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i16 %b.add, ptr %b
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load volatile i32, ptr %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile i32 %c.add, ptr %c
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load volatile i64, ptr %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store volatile i64 %d.add, ptr %d
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load volatile float, ptr %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store volatile float %e.add, ptr %c
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load volatile double, ptr %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store volatile double %f.add, ptr %c
 
   ; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
@@ -184,254 +333,358 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
 
   ; TODO: make this operation consistent with the one for <4 x i8>
   ; This operation lowers to a "element wise volatile PTX operation".
-  ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load volatile <2 x i8>, ptr %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <2 x i8> %h.add, ptr %b
 
   ; TODO: make this operation consistent with the one for <2 x i8>
   ; This operation lowers to a "full vector volatile PTX operation".
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load volatile <4 x i8>, ptr %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <4 x i8> %i.add, ptr %c
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load volatile <2 x i16>, ptr %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <2 x i16> %j.add, ptr %c
 
-  ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load volatile <4 x i16>, ptr %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <4 x i16> %k.add, ptr %d
 
-  ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load volatile <2 x i32>, ptr %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <2 x i32> %l.add, ptr %d
 
-  ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load volatile <4 x i32>, ptr %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <4 x i32> %m.add, ptr %d
 
-  ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load volatile <2 x i64>, ptr %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store volatile <2 x i64> %n.add, ptr %d
 
-  ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load volatile <2 x float>, ptr %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <2 x float> %o.add, ptr %d
 
-  ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load volatile <4 x float>, ptr %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <4 x float> %p.add, ptr %d
 
-  ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load volatile <2 x double>, ptr %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store volatile <2 x double> %q.add, ptr %d
 
   ret void
 }
 
-; CHECK-LABEL: generic_unordered_sys
 define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: generic_unordered_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_param_0];
+; SM60-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [generic_unordered_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [generic_unordered_sys_param_2];
+; SM60-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [generic_unordered_sys_param_3];
+; SM60-NEXT:    ld.volatile.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [generic_unordered_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: generic_unordered_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [generic_unordered_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [generic_unordered_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [generic_unordered_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [generic_unordered_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [generic_unordered_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a unordered, align 1
 
-  ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b unordered, align 2
 
-  ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c unordered, align 4
 
-  ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d unordered, align 8
 
-  ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e unordered, align 4
 
-  ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: generic_unordered_volatile_sys
 define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_unordered_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_unordered_volatile_sys_param_0];
+; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [generic_unordered_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [generic_unordered_volatile_sys_param_2];
+; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [generic_unordered_volatile_sys_param_3];
+; CHECK-NEXT:    ld.volatile.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [generic_unordered_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a unordered, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b unordered, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c unordered, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d unordered, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e unordered, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: generic_monotonic_sys
 define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: generic_monotonic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_param_0];
+; SM60-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [generic_monotonic_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [generic_monotonic_sys_param_2];
+; SM60-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [generic_monotonic_sys_param_3];
+; SM60-NEXT:    ld.volatile.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [generic_monotonic_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: generic_monotonic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [generic_monotonic_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [generic_monotonic_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [generic_monotonic_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [generic_monotonic_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [generic_monotonic_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr %a monotonic, align 1
 
-  ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr %b monotonic, align 2
 
-  ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr %c monotonic, align 4
 
-  ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr %d monotonic, align 8
 
-  ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr %e monotonic, align 4
 
-  ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr %e monotonic, align 8
 
   ret void
 }
 
-; CHECK-LABEL: generic_monotonic_volatile_sys
 define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: generic_monotonic_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_monotonic_volatile_sys_param_0];
+; CHECK-NEXT:    ld.volatile.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [generic_monotonic_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [generic_monotonic_volatile_sys_param_2];
+; CHECK-NEXT:    st.volatile.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [generic_monotonic_volatile_sys_param_3];
+; CHECK-NEXT:    ld.volatile.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [generic_monotonic_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.volatile.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr %a monotonic, align 1
 
-  ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr %b monotonic, align 2
 
-  ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr %c monotonic, align 4
 
-  ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr %d monotonic, align 8
 
-  ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr %e monotonic, align 4
 
-  ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr %e monotonic, align 8
 
   ret void
@@ -439,415 +692,711 @@ define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr
 
 ;; global statespace
 
-; CHECK-LABEL: global_weak
 define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
-  ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_weak(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_weak_param_0];
+; CHECK-NEXT:    ld.global.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [global_weak_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [global_weak_param_2];
+; CHECK-NEXT:    st.global.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [global_weak_param_3];
+; CHECK-NEXT:    ld.global.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.global.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.global.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.global.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.global.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.global.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.global.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.global.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.global.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.global.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.global.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.global.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.global.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.global.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.global.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.global.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.global.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.global.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.global.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.global.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.global.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.global.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.global.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.global.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.global.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.global.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.global.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.global.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.global.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.global.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(1) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i8 %a.add, ptr addrspace(1) %a
 
-  ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load i16, ptr addrspace(1) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i16 %b.add, ptr addrspace(1) %b
 
-  ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load i32, ptr addrspace(1) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store i32 %c.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load i64, ptr addrspace(1) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store i64 %d.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load float, ptr addrspace(1) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store float %e.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load double, ptr addrspace(1) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store double %f.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load <2 x i8>, ptr addrspace(1) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <2 x i8> %h.add, ptr addrspace(1) %b
 
-  ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load <4 x i8>, ptr addrspace(1) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <4 x i8> %i.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load <2 x i16>, ptr addrspace(1) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <2 x i16> %j.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load <4 x i16>, ptr addrspace(1) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <4 x i16> %k.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load <2 x i32>, ptr addrspace(1) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store <2 x i32> %l.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load <4 x i32>, ptr addrspace(1) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store <4 x i32> %m.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load <2 x i64>, ptr addrspace(1) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store <2 x i64> %n.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load <2 x float>, ptr addrspace(1) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store <2 x float> %o.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load <4 x float>, ptr addrspace(1) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store <4 x float> %p.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load <2 x double>, ptr addrspace(1) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store <2 x double> %q.add, ptr addrspace(1) %d
 
   ret void
 }
 
-; CHECK-LABEL: global_volatile
 define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
-  ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: global_volatile(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_param_0];
+; CHECK-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [global_volatile_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [global_volatile_param_2];
+; CHECK-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [global_volatile_param_3];
+; CHECK-NEXT:    ld.volatile.global.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.global.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.global.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.global.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.volatile.global.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.volatile.global.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.volatile.global.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.volatile.global.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.volatile.global.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.volatile.global.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.volatile.global.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.volatile.global.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.volatile.global.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.volatile.global.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.volatile.global.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.volatile.global.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.volatile.global.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.volatile.global.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.volatile.global.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.volatile.global.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.volatile.global.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(1) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i8 %a.add, ptr addrspace(1) %a
 
-  ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load volatile i16, ptr addrspace(1) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i16 %b.add, ptr addrspace(1) %b
 
-  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load volatile i32, ptr addrspace(1) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile i32 %c.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load volatile i64, ptr addrspace(1) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store volatile i64 %d.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load volatile float, ptr addrspace(1) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store volatile float %e.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load volatile double, ptr addrspace(1) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store volatile double %f.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load volatile <2 x i8>, ptr addrspace(1) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile<2 x i8> %h.add, ptr addrspace(1) %b
 
-  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load volatile <4 x i8>, ptr addrspace(1) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile<4 x i8> %i.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load volatile <2 x i16>, ptr addrspace(1) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile<2 x i16> %j.add, ptr addrspace(1) %c
 
-  ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load volatile <4 x i16>, ptr addrspace(1) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile<4 x i16> %k.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load volatile <2 x i32>, ptr addrspace(1) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile<2 x i32> %l.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load volatile <4 x i32>, ptr addrspace(1) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile<4 x i32> %m.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load volatile <2 x i64>, ptr addrspace(1) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store volatile<2 x i64> %n.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load volatile <2 x float>, ptr addrspace(1) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile<2 x float> %o.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load volatile <4 x float>, ptr addrspace(1) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile<4 x float> %p.add, ptr addrspace(1) %d
 
-  ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load volatile <2 x double>, ptr addrspace(1) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store volatile<2 x double> %q.add, ptr addrspace(1) %d
 
   ret void
 }
 
-; CHECK-LABEL: global_unordered_sys
 define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: global_unordered_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_param_0];
+; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [global_unordered_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [global_unordered_sys_param_2];
+; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [global_unordered_sys_param_3];
+; SM60-NEXT:    ld.volatile.global.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [global_unordered_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.global.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.global.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.global.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.global.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: global_unordered_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [global_unordered_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [global_unordered_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [global_unordered_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.global.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [global_unordered_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.global.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.global.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.global.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.global.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.global.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.global.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
 
-  ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
 
-  ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
 
-  ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
 
-  ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
 
-  ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: global_unordered_volatile_sys
 define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: global_unordered_volatile_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_param_0];
+; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [global_unordered_volatile_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [global_unordered_volatile_sys_param_2];
+; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [global_unordered_volatile_sys_param_3];
+; SM60-NEXT:    ld.volatile.global.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [global_unordered_volatile_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.global.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.global.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.global.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.global.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: global_unordered_volatile_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [global_unordered_volatile_sys_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [global_unordered_volatile_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [global_unordered_volatile_sys_param_2];
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [global_unordered_volatile_sys_param_3];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [global_unordered_volatile_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
 
-  ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
 
-  ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
 
-  ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
 
-  ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
 
-  ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: global_monotonic_sys
 define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: global_monotonic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_param_0];
+; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [global_monotonic_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [global_monotonic_sys_param_2];
+; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [global_monotonic_sys_param_3];
+; SM60-NEXT:    ld.volatile.global.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [global_monotonic_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.global.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.global.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.global.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.global.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: global_monotonic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [global_monotonic_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [global_monotonic_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [global_monotonic_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.global.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [global_monotonic_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.global.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.global.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.global.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.global.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.global.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.global.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.global.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
 
-  ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
 
-  ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
 
-  ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
 
-  ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
 
-  ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
 
   ret void
 }
 
-; CHECK-LABEL: global_monotonic_volatile_sys
 define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: global_monotonic_volatile_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_param_0];
+; SM60-NEXT:    ld.volatile.global.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [global_monotonic_volatile_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [global_monotonic_volatile_sys_param_2];
+; SM60-NEXT:    st.volatile.global.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [global_monotonic_volatile_sys_param_3];
+; SM60-NEXT:    ld.volatile.global.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [global_monotonic_volatile_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.global.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.global.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.global.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.global.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.global.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.global.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.global.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.global.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.global.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: global_monotonic_volatile_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [global_monotonic_volatile_sys_param_0];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [global_monotonic_volatile_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [global_monotonic_volatile_sys_param_2];
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [global_monotonic_volatile_sys_param_3];
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [global_monotonic_volatile_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.mmio.relaxed.sys.global.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.mmio.relaxed.sys.global.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
 
-  ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
 
-  ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
 
-  ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
 
-  ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
 
-  ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
 
   ret void
@@ -855,391 +1404,643 @@ define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1)
 
 ;; shared statespace
 
-; CHECK-LABEL: shared_weak
 define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
-  ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_weak(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_weak_param_0];
+; CHECK-NEXT:    ld.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [shared_weak_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [shared_weak_param_2];
+; CHECK-NEXT:    st.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [shared_weak_param_3];
+; CHECK-NEXT:    ld.shared.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.shared.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.shared.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.shared.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.shared.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.shared.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.shared.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.shared.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.shared.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.shared.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.shared.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.shared.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.shared.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.shared.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.shared.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.shared.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.shared.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.shared.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.shared.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.shared.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.shared.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.shared.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.shared.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.shared.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.shared.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.shared.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.shared.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.shared.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.shared.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.shared.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(3) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i8 %a.add, ptr addrspace(3) %a
 
-  ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load i16, ptr addrspace(3) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i16 %b.add, ptr addrspace(3) %b
 
-  ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load i32, ptr addrspace(3) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store i32 %c.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load i64, ptr addrspace(3) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store i64 %d.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load float, ptr addrspace(3) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store float %e.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load double, ptr addrspace(3) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store double %f.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load <2 x i8>, ptr addrspace(3) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <2 x i8> %h.add, ptr addrspace(3) %b
 
-  ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load <4 x i8>, ptr addrspace(3) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <4 x i8> %i.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load <2 x i16>, ptr addrspace(3) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <2 x i16> %j.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load <4 x i16>, ptr addrspace(3) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <4 x i16> %k.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load <2 x i32>, ptr addrspace(3) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store <2 x i32> %l.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load <4 x i32>, ptr addrspace(3) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store <4 x i32> %m.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load <2 x i64>, ptr addrspace(3) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store <2 x i64> %n.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load <2 x float>, ptr addrspace(3) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store <2 x float> %o.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load <4 x float>, ptr addrspace(3) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store <4 x float> %p.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load <2 x double>, ptr addrspace(3) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store <2 x double> %q.add, ptr addrspace(3) %d
 
   ret void
 }
 
-; CHECK-LABEL: shared_volatile
 define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_volatile(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_param_0];
+; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [shared_volatile_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [shared_volatile_param_2];
+; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [shared_volatile_param_3];
+; CHECK-NEXT:    ld.volatile.shared.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.volatile.shared.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.volatile.shared.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.volatile.shared.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.volatile.shared.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.volatile.shared.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.volatile.shared.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.volatile.shared.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.volatile.shared.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.volatile.shared.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.volatile.shared.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.volatile.shared.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.volatile.shared.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.volatile.shared.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.volatile.shared.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.volatile.shared.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.volatile.shared.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(3) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i8 %a.add, ptr addrspace(3) %a
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load volatile i16, ptr addrspace(3) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i16 %b.add, ptr addrspace(3) %b
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load volatile i32, ptr addrspace(3) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile i32 %c.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load volatile i64, ptr addrspace(3) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store volatile i64 %d.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load volatile float, ptr addrspace(3) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store volatile float %e.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load volatile double, ptr addrspace(3) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store volatile double %f.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load volatile <2 x i8>, ptr addrspace(3) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <2 x i8> %h.add, ptr addrspace(3) %b
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load volatile <4 x i8>, ptr addrspace(3) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <4 x i8> %i.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load volatile <2 x i16>, ptr addrspace(3) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <2 x i16> %j.add, ptr addrspace(3) %c
 
-  ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load volatile <4 x i16>, ptr addrspace(3) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <4 x i16> %k.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load volatile <2 x i32>, ptr addrspace(3) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <2 x i32> %l.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load volatile <4 x i32>, ptr addrspace(3) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <4 x i32> %m.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load volatile <2 x i64>, ptr addrspace(3) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store volatile <2 x i64> %n.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load volatile <2 x float>, ptr addrspace(3) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <2 x float> %o.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load volatile <4 x float>, ptr addrspace(3) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <4 x float> %p.add, ptr addrspace(3) %d
 
-  ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load volatile <2 x double>, ptr addrspace(3) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store volatile <2 x double> %q.add, ptr addrspace(3) %d
 
   ret void
 }
 
-; CHECK-LABEL: shared_unordered_sys
 define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: shared_unordered_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_param_0];
+; SM60-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [shared_unordered_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [shared_unordered_sys_param_2];
+; SM60-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [shared_unordered_sys_param_3];
+; SM60-NEXT:    ld.volatile.shared.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [shared_unordered_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.shared.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.shared.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.shared.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.shared.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.shared.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.shared.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.shared.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.shared.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.shared.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: shared_unordered_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [shared_unordered_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [shared_unordered_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [shared_unordered_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [shared_unordered_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.shared.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [shared_unordered_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.shared.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.shared.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.shared.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.shared.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.shared.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.shared.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
 
-  ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
 
-  ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
 
-  ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
 
-  ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
 
-  ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: shared_unordered_volatile_sys
 define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_unordered_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_unordered_volatile_sys_param_0];
+; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [shared_unordered_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [shared_unordered_volatile_sys_param_2];
+; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [shared_unordered_volatile_sys_param_3];
+; CHECK-NEXT:    ld.volatile.shared.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [shared_unordered_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: shared_monotonic_sys
 define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; SM60-LABEL: shared_monotonic_sys(
+; SM60:       {
+; SM60-NEXT:    .reg .b16 %rs<5>;
+; SM60-NEXT:    .reg .b32 %r<3>;
+; SM60-NEXT:    .reg .f32 %f<3>;
+; SM60-NEXT:    .reg .b64 %rd<8>;
+; SM60-NEXT:    .reg .f64 %fd<3>;
+; SM60-EMPTY:
+; SM60-NEXT:  // %bb.0:
+; SM60-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_param_0];
+; SM60-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; SM60-NEXT:    ld.param.u64 %rd2, [shared_monotonic_sys_param_1];
+; SM60-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM60-NEXT:    ld.param.u64 %rd3, [shared_monotonic_sys_param_2];
+; SM60-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; SM60-NEXT:    ld.param.u64 %rd4, [shared_monotonic_sys_param_3];
+; SM60-NEXT:    ld.volatile.shared.u16 %rs3, [%rd2];
+; SM60-NEXT:    ld.param.u64 %rd5, [shared_monotonic_sys_param_4];
+; SM60-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM60-NEXT:    st.volatile.shared.u16 [%rd2], %rs4;
+; SM60-NEXT:    ld.volatile.shared.u32 %r1, [%rd3];
+; SM60-NEXT:    add.s32 %r2, %r1, 1;
+; SM60-NEXT:    st.volatile.shared.u32 [%rd3], %r2;
+; SM60-NEXT:    ld.volatile.shared.u64 %rd6, [%rd4];
+; SM60-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM60-NEXT:    st.volatile.shared.u64 [%rd4], %rd7;
+; SM60-NEXT:    ld.volatile.shared.f32 %f1, [%rd5];
+; SM60-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM60-NEXT:    st.volatile.shared.f32 [%rd5], %f2;
+; SM60-NEXT:    ld.volatile.shared.f64 %fd1, [%rd5];
+; SM60-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM60-NEXT:    st.volatile.shared.f64 [%rd5], %fd2;
+; SM60-NEXT:    ret;
+;
+; SM70-LABEL: shared_monotonic_sys(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<5>;
+; SM70-NEXT:    .reg .b32 %r<3>;
+; SM70-NEXT:    .reg .f32 %f<3>;
+; SM70-NEXT:    .reg .b64 %rd<8>;
+; SM70-NEXT:    .reg .f64 %fd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.u64 %rd1, [shared_monotonic_sys_param_0];
+; SM70-NEXT:    ld.relaxed.sys.shared.u8 %rs1, [%rd1];
+; SM70-NEXT:    ld.param.u64 %rd2, [shared_monotonic_sys_param_1];
+; SM70-NEXT:    add.s16 %rs2, %rs1, 1;
+; SM70-NEXT:    ld.param.u64 %rd3, [shared_monotonic_sys_param_2];
+; SM70-NEXT:    st.relaxed.sys.shared.u8 [%rd1], %rs2;
+; SM70-NEXT:    ld.param.u64 %rd4, [shared_monotonic_sys_param_3];
+; SM70-NEXT:    ld.relaxed.sys.shared.u16 %rs3, [%rd2];
+; SM70-NEXT:    ld.param.u64 %rd5, [shared_monotonic_sys_param_4];
+; SM70-NEXT:    add.s16 %rs4, %rs3, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u16 [%rd2], %rs4;
+; SM70-NEXT:    ld.relaxed.sys.shared.u32 %r1, [%rd3];
+; SM70-NEXT:    add.s32 %r2, %r1, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u32 [%rd3], %r2;
+; SM70-NEXT:    ld.relaxed.sys.shared.u64 %rd6, [%rd4];
+; SM70-NEXT:    add.s64 %rd7, %rd6, 1;
+; SM70-NEXT:    st.relaxed.sys.shared.u64 [%rd4], %rd7;
+; SM70-NEXT:    ld.relaxed.sys.shared.f32 %f1, [%rd5];
+; SM70-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; SM70-NEXT:    st.relaxed.sys.shared.f32 [%rd5], %f2;
+; SM70-NEXT:    ld.relaxed.sys.shared.f64 %fd1, [%rd5];
+; SM70-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; SM70-NEXT:    st.relaxed.sys.shared.f64 [%rd5], %fd2;
+; SM70-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
 
-  ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
 
-  ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
 
-  ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
 
-  ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
 
-  ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
-  ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
-  ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
 
   ret void
 }
 
-; CHECK-LABEL: shared_monotonic_volatile_sys
 define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
-  ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: shared_monotonic_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_monotonic_volatile_sys_param_0];
+; CHECK-NEXT:    ld.volatile.shared.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [shared_monotonic_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [shared_monotonic_volatile_sys_param_2];
+; CHECK-NEXT:    st.volatile.shared.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [shared_monotonic_volatile_sys_param_3];
+; CHECK-NEXT:    ld.volatile.shared.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [shared_monotonic_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.volatile.shared.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.volatile.shared.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.volatile.shared.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.volatile.shared.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.shared.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.volatile.shared.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.shared.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
 
-  ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
 
-  ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
 
-  ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
 
-  ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
 
   ret void
@@ -1247,367 +2048,575 @@ define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3)
 
 ;; local statespace
 
-; CHECK-LABEL: local_weak
 define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_weak(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_weak_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_weak_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_weak_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_weak_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.local.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.local.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.local.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.local.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.local.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.local.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.local.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.local.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.local.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.local.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.local.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.local.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.local.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.local.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.local.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.local.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.local.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.local.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load i8, ptr addrspace(5) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i8 %a.add, ptr addrspace(5) %a
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load i16, ptr addrspace(5) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store i16 %b.add, ptr addrspace(5) %b
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load i32, ptr addrspace(5) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store i32 %c.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load i64, ptr addrspace(5) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store i64 %d.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load float, ptr addrspace(5) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store float %e.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load double, ptr addrspace(5) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store double %f.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load <2 x i8>, ptr addrspace(5) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <2 x i8> %h.add, ptr addrspace(5) %b
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load <4 x i8>, ptr addrspace(5) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <4 x i8> %i.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load <2 x i16>, ptr addrspace(5) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store <2 x i16> %j.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load <4 x i16>, ptr addrspace(5) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store <4 x i16> %k.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load <2 x i32>, ptr addrspace(5) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store <2 x i32> %l.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load <4 x i32>, ptr addrspace(5) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store <4 x i32> %m.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load <2 x i64>, ptr addrspace(5) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store <2 x i64> %n.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load <2 x float>, ptr addrspace(5) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store <2 x float> %o.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load <4 x float>, ptr addrspace(5) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store <4 x float> %p.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load <2 x double>, ptr addrspace(5) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store <2 x double> %q.add, ptr addrspace(5) %d
 
   ret void
 }
 
-; CHECK-LABEL: local_volatile
 define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_volatile(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<29>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .f32 %f<15>;
+; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .f64 %fd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_volatile_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_volatile_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_volatile_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd5, [%rd4];
+; CHECK-NEXT:    add.s64 %rd6, %rd5, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd6;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd3];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd3], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd3];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd3], %fd2;
+; CHECK-NEXT:    ld.local.v2.u8 {%rs5, %rs6}, [%rd2];
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    st.local.v2.u8 [%rd2], {%rs8, %rs7};
+; CHECK-NEXT:    ld.local.u32 %r3, [%rd3];
+; CHECK-NEXT:    bfe.u32 %r4, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r4;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs10;
+; CHECK-NEXT:    bfe.u32 %r6, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r6;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs12;
+; CHECK-NEXT:    bfi.b32 %r8, %r7, %r5, 8, 8;
+; CHECK-NEXT:    bfe.u32 %r9, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r9;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs14;
+; CHECK-NEXT:    bfi.b32 %r11, %r10, %r8, 16, 8;
+; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r12;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
+; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r14;
+; CHECK-NEXT:    ld.local.u32 %r15, [%rd3];
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
+; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
+; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
+; CHECK-NEXT:    st.local.u32 [%rd3], %r16;
+; CHECK-NEXT:    ld.local.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
+; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
+; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
+; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
+; CHECK-NEXT:    st.local.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
+; CHECK-NEXT:    ld.local.v2.u32 {%r17, %r18}, [%rd4];
+; CHECK-NEXT:    add.s32 %r19, %r18, 1;
+; CHECK-NEXT:    add.s32 %r20, %r17, 1;
+; CHECK-NEXT:    st.local.v2.u32 [%rd4], {%r20, %r19};
+; CHECK-NEXT:    ld.local.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
+; CHECK-NEXT:    add.s32 %r25, %r24, 1;
+; CHECK-NEXT:    add.s32 %r26, %r23, 1;
+; CHECK-NEXT:    add.s32 %r27, %r22, 1;
+; CHECK-NEXT:    add.s32 %r28, %r21, 1;
+; CHECK-NEXT:    st.local.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.local.v2.u64 {%rd7, %rd8}, [%rd4];
+; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
+; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
+; CHECK-NEXT:    st.local.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    ld.local.v2.f32 {%f3, %f4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    st.local.v2.f32 [%rd4], {%f6, %f5};
+; CHECK-NEXT:    ld.local.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
+; CHECK-NEXT:    st.local.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
+; CHECK-NEXT:    ld.local.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.v2.f64 [%rd4], {%fd6, %fd5};
+; CHECK-NEXT:    ret;
   %a.load = load volatile i8, ptr addrspace(5) %a
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i8 %a.add, ptr addrspace(5) %a
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load volatile i16, ptr addrspace(5) %b
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store volatile i16 %b.add, ptr addrspace(5) %b
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load volatile i32, ptr addrspace(5) %c
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile i32 %c.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load volatile i64, ptr addrspace(5) %d
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store volatile i64 %d.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load volatile float, ptr addrspace(5) %c
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store volatile float %e.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load volatile double, ptr addrspace(5) %c
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store volatile double %f.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %h.load = load volatile <2 x i8>, ptr addrspace(5) %b
   %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
-  ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <2 x i8> %h.add, ptr addrspace(5) %b
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %i.load = load volatile <4 x i8>, ptr addrspace(5) %c
   %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <4 x i8> %i.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %j.load = load volatile <2 x i16>, ptr addrspace(5) %c
   %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store volatile <2 x i16> %j.add, ptr addrspace(5) %c
 
-  ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %k.load = load volatile <4 x i16>, ptr addrspace(5) %d
   %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
-  ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
   store volatile <4 x i16> %k.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %l.load = load volatile <2 x i32>, ptr addrspace(5) %d
   %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
-  ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <2 x i32> %l.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %m.load = load volatile <4 x i32>, ptr addrspace(5) %d
   %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
-  ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   store volatile <4 x i32> %m.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %n.load = load volatile <2 x i64>, ptr addrspace(5) %d
   %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
-  ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
   store volatile <2 x i64> %n.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %o.load = load volatile <2 x float>, ptr addrspace(5) %d
   %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
-  ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <2 x float> %o.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %p.load = load volatile <4 x float>, ptr addrspace(5) %d
   %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
-  ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   store volatile <4 x float> %p.add, ptr addrspace(5) %d
 
-  ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
   %q.load = load volatile <2 x double>, ptr addrspace(5) %d
   %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
-  ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
   store volatile <2 x double> %q.add, ptr addrspace(5) %d
 
   ret void
 }
 
-; CHECK-LABEL: local_unordered_sys
 define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_sys_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_unordered_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_unordered_sys_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_unordered_sys_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [local_unordered_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: local_unordered_volatile_sys
 define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_unordered_volatile_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_unordered_volatile_sys_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_unordered_volatile_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_unordered_volatile_sys_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_unordered_volatile_sys_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [local_unordered_volatile_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
   %e.add = fadd float %e.load, 1.0
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
 
   ret void
 }
 
-; CHECK-LABEL: local_monotonic_sys
 define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_sys(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_sys_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_monotonic_sys_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_monotonic_sys_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_monotonic_sys_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [local_monotonic_sys_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
 
   ret void
 }
 
-; CHECK-LABEL: local_monotonic_volatile
 define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
-  ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+; CHECK-LABEL: local_monotonic_volatile(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .f32 %f<3>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .f64 %fd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [local_monotonic_volatile_param_0];
+; CHECK-NEXT:    ld.local.u8 %rs1, [%rd1];
+; CHECK-NEXT:    ld.param.u64 %rd2, [local_monotonic_volatile_param_1];
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    ld.param.u64 %rd3, [local_monotonic_volatile_param_2];
+; CHECK-NEXT:    st.local.u8 [%rd1], %rs2;
+; CHECK-NEXT:    ld.param.u64 %rd4, [local_monotonic_volatile_param_3];
+; CHECK-NEXT:    ld.local.u16 %rs3, [%rd2];
+; CHECK-NEXT:    ld.param.u64 %rd5, [local_monotonic_volatile_param_4];
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    st.local.u16 [%rd2], %rs4;
+; CHECK-NEXT:    ld.local.u32 %r1, [%rd3];
+; CHECK-NEXT:    add.s32 %r2, %r1, 1;
+; CHECK-NEXT:    st.local.u32 [%rd3], %r2;
+; CHECK-NEXT:    ld.local.u64 %rd6, [%rd4];
+; CHECK-NEXT:    add.s64 %rd7, %rd6, 1;
+; CHECK-NEXT:    st.local.u64 [%rd4], %rd7;
+; CHECK-NEXT:    ld.local.f32 %f1, [%rd5];
+; CHECK-NEXT:    add.rn.f32 %f2, %f1, 0f3F800000;
+; CHECK-NEXT:    st.local.f32 [%rd5], %f2;
+; CHECK-NEXT:    ld.local.f64 %fd1, [%rd5];
+; CHECK-NEXT:    add.rn.f64 %fd2, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.local.f64 [%rd5], %fd2;
+; CHECK-NEXT:    ret;
   %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
   %a.add = add i8 %a.load, 1
-  ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
 
-  ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
   %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
   %b.add = add i16 %b.load, 1
-  ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
   store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
 
-  ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
   %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
   %c.add = add i32 %c.load, 1
-  ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
   store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
 
-  ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
   %d.add = add i64 %d.load, 1
-  ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
   store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
 
-  ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
   %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
   %e.add = fadd float %e.load, 1.
-  ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
   store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
 
-  ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
   %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
   %f.add = fadd double %f.load, 1.
-  ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
   store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
 
   ret void
diff --git a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
index e139d3c9a9df0e..5bd3580f5e95ec 100644
--- a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
+++ b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
@@ -1,12 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
 
 ; The load is to the high byte of the 2-byte store
 @g = global i8 -75
 
 define void @f(i16 %v) {
-; CHECK-LABEL: f
-; CHECK: sth 3, -2(1)
-; CHECK: lbz 3, -2(1)
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addis 4, 2, .LC0@toc@ha
+; CHECK-NEXT:    sth 3, -2(1)
+; CHECK-NEXT:    ld 4, .LC0@toc@l(4)
+; CHECK-NEXT:    lbz 3, -2(1)
+; CHECK-NEXT:    stb 3, 0(4)
+; CHECK-NEXT:    blr
   %p32 = alloca i16
   store i16 %v, ptr %p32
   %tmp = load i8, ptr %p32

>From da9d77053ab64798635c63dedb207356d54e8e41 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Tue, 22 Oct 2024 15:07:17 +0100
Subject: [PATCH 2/3] EarlyCSE: create casts on type-mismatch

getOrCreateResult suffers from the deficiency that it doesn't attempt to
create casts when types mismatch. Fix this deficiency, making EarlyCSE
more powerful.
---
 llvm/lib/Transforms/Scalar/EarlyCSE.cpp       |  62 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    |   4 +-
 .../branch-folding-implicit-def-subreg.ll     | 640 +++++++++---------
 llvm/test/CodeGen/NVPTX/load-store.ll         | 272 +++++---
 .../PowerPC/big-endian-store-forward.ll       |   1 -
 llvm/test/CodeGen/PowerPC/p10-spill-creq.ll   |  62 +-
 .../Transforms/EarlyCSE/invariant.start.ll    |  30 +-
 llvm/test/Transforms/EarlyCSE/opaque-ptr.ll   |  16 +-
 8 files changed, 590 insertions(+), 497 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index a1dbb4e1d5e75f..9714611cda8b0f 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -964,32 +965,45 @@ class EarlyCSE {
   bool overridingStores(const ParseMemoryInst &Earlier,
                         const ParseMemoryInst &Later);
 
-  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
-    // TODO: We could insert relevant casts on type mismatch here.
-    if (auto *LI = dyn_cast<LoadInst>(Inst))
-      return LI->getType() == ExpectedType ? LI : nullptr;
-    if (auto *SI = dyn_cast<StoreInst>(Inst)) {
-      Value *V = SI->getValueOperand();
-      return V->getType() == ExpectedType ? V : nullptr;
+  Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType) const {
+    if (!isa<IntrinsicInst, LoadInst, StoreInst>(Inst))
+      llvm_unreachable("Instruction not supported");
+
+    // The load or the store's first operand.
+    Value *V;
+    if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
+      if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::masked_load:
+          V = II;
+          break;
+        case Intrinsic::masked_store:
+          V = II->getOperand(0);
+          break;
+        default:
+          return nullptr;
+        }
+      else
+        return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
+    } else {
+      V = isa<LoadInst>(Inst) ? Inst : cast<StoreInst>(Inst)->getValueOperand();
     }
-    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
-    auto *II = cast<IntrinsicInst>(Inst);
-    if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
-      return getOrCreateResultNonTargetMemIntrinsic(II, ExpectedType);
-    return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
-  }
 
-  Value *getOrCreateResultNonTargetMemIntrinsic(IntrinsicInst *II,
-                                                Type *ExpectedType) const {
-    // TODO: We could insert relevant casts on type mismatch here.
-    switch (II->getIntrinsicID()) {
-    case Intrinsic::masked_load:
-      return II->getType() == ExpectedType ? II : nullptr;
-    case Intrinsic::masked_store: {
-      Value *V = II->getOperand(0);
-      return V->getType() == ExpectedType ? V : nullptr;
-    }
-    }
+    Type *ActualType = V->getType();
+    BasicBlock *TheBB = Inst->getParent();
+
+    // First handle the case when no cast is required.
+    if (ActualType == ExpectedType)
+      return V;
+
+    // Try to create BitCast, SExt, or Trunc.
+    IRBuilder<> Builder(TheBB, std::next(Inst->getIterator()));
+    if (CastInst::castIsValid(Instruction::BitCast, V, ExpectedType))
+      return Builder.CreateBitCast(V, ExpectedType);
+    if (CastInst::castIsValid(Instruction::SExt, V, ExpectedType))
+      return Builder.CreateSExt(V, ExpectedType);
+    if (CastInst::castIsValid(Instruction::Trunc, V, ExpectedType))
+      return Builder.CreateTrunc(V, ExpectedType);
     return nullptr;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index c3694158e7b971..6fe26286b74c22 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3683,7 +3683,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 {
   ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (invariant load (p1) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1)
   ; CHECK-NEXT:   [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s8) from `ptr addrspace(1) undef`, addrspace 1)
-  ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s16) from `ptr addrspace(1) undef`, addrspace 1)
+  ; CHECK-NEXT:   [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[LOAD2]](s8)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $scc
   ; CHECK-NEXT:   [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i8_i8_i16
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:_(p4) = COPY [[COPY7]]
@@ -3720,7 +3720,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 {
   ; CHECK-NEXT:   G_STORE [[COPY18]](s16), [[PTR_ADD3]](p5) :: (store (s16) into stack + 8, align 8, addrspace 5)
   ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
   ; CHECK-NEXT:   [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[AMDGPU_WAVE_ADDRESS]], [[C6]](s32)
-  ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[PTR_ADD4]](p5) :: (store (s16) into stack + 12, align 4, addrspace 5)
+  ; CHECK-NEXT:   G_STORE [[SEXT]](s16), [[PTR_ADD4]](p5) :: (store (s16) into stack + 12, align 4, addrspace 5)
   ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
   ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
   ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 055e9850de3d68..265204726da124 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -4,7 +4,7 @@
 define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 %arg2, i1 %arg3, i1 %arg4, i1 %arg5, i1 %arg6, ptr addrspace(3) %arg7, ptr addrspace(3) %arg8, ptr addrspace(3) %arg9, ptr addrspace(3) %arg10) {
   ; GFX90A-LABEL: name: f1
   ; GFX90A: bb.0.bb:
-  ; GFX90A-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.1(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr15, $sgpr10_sgpr11
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr32 = S_MOV_B32 0
@@ -30,39 +30,25 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.56, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.1.bb103:
-  ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.59, implicit $vcc
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.2:
-  ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3
+  ; GFX90A-NEXT: bb.1:
+  ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
   ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
+  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr25 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22
-  ; GFX90A-NEXT:   renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 0
+  ; GFX90A-NEXT:   S_BRANCH %bb.60
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.3.Flow17:
-  ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.58(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.58, implicit $vcc
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.4.bb15:
-  ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
+  ; GFX90A-NEXT: bb.2.bb15:
+  ; GFX90A-NEXT:   successors: %bb.33(0x40000000), %bb.3(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec
@@ -73,10 +59,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.35, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.33, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.5:
-  ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX90A-NEXT: bb.3:
+  ; GFX90A-NEXT:   successors: %bb.4(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
@@ -103,96 +89,96 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.6.Flow20:
-  ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.4.Flow20:
+  ; GFX90A-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr19 = COPY renamable $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr18 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr21 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr20 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr52 = COPY $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr25 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr23 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr22 = COPY $sgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr25 = COPY $sgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24 = COPY $sgpr15, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.7.Flow19:
-  ; GFX90A-NEXT:   successors: %bb.63(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.5.Flow19:
+  ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.6(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr25, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT:   $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $sgpr28_sgpr29, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.63, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.65, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.8.Flow32:
-  ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
+  ; GFX90A-NEXT: bb.6.Flow32:
+  ; GFX90A-NEXT:   successors: %bb.7(0x40000000), %bb.8(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.9.bb89:
-  ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
+  ; GFX90A-NEXT: bb.7.bb89:
+  ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.10.Flow33:
-  ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
+  ; GFX90A-NEXT: bb.8.Flow33:
+  ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.11.bb84:
-  ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
+  ; GFX90A-NEXT: bb.9.bb84:
+  ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.12.Flow34:
-  ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
+  ; GFX90A-NEXT: bb.10.Flow34:
+  ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr52_sgpr53, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.14, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.13.bb79:
-  ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
+  ; GFX90A-NEXT: bb.11.bb79:
+  ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.14.Flow35:
-  ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
+  ; GFX90A-NEXT: bb.12.Flow35:
+  ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $sgpr16_sgpr17, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.16, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.14, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.15.bb72:
-  ; GFX90A-NEXT:   successors: %bb.16(0x80000000)
+  ; GFX90A-NEXT: bb.13.bb72:
+  ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr8 = S_ADD_U32 renamable $sgpr6, 48, implicit-def $scc
@@ -202,162 +188,162 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit undef $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.16.Flow36:
-  ; GFX90A-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
+  ; GFX90A-NEXT: bb.14.Flow36:
+  ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr50_sgpr51, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.18, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.16, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.17.bb67:
-  ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
+  ; GFX90A-NEXT: bb.15.bb67:
+  ; GFX90A-NEXT:   successors: %bb.16(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.18.Flow37:
-  ; GFX90A-NEXT:   successors: %bb.19(0x40000000), %bb.20(0x40000000)
+  ; GFX90A-NEXT: bb.16.Flow37:
+  ; GFX90A-NEXT:   successors: %bb.17(0x40000000), %bb.18(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr48_sgpr49, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.20, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.18, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.19.bb62:
-  ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
+  ; GFX90A-NEXT: bb.17.bb62:
+  ; GFX90A-NEXT:   successors: %bb.18(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.20.Flow38:
-  ; GFX90A-NEXT:   successors: %bb.21(0x40000000), %bb.22(0x40000000)
+  ; GFX90A-NEXT: bb.18.Flow38:
+  ; GFX90A-NEXT:   successors: %bb.19(0x40000000), %bb.20(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.22, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.20, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.21.bb54:
-  ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
+  ; GFX90A-NEXT: bb.19.bb54:
+  ; GFX90A-NEXT:   successors: %bb.20(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.22.Flow39:
-  ; GFX90A-NEXT:   successors: %bb.23(0x40000000), %bb.24(0x40000000)
+  ; GFX90A-NEXT: bb.20.Flow39:
+  ; GFX90A-NEXT:   successors: %bb.21(0x40000000), %bb.22(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.24, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.22, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.23.bb47:
-  ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
+  ; GFX90A-NEXT: bb.21.bb47:
+  ; GFX90A-NEXT:   successors: %bb.22(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.24.Flow40:
-  ; GFX90A-NEXT:   successors: %bb.25(0x40000000), %bb.26(0x40000000)
+  ; GFX90A-NEXT: bb.22.Flow40:
+  ; GFX90A-NEXT:   successors: %bb.23(0x40000000), %bb.24(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.26, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.24, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.25.bb40:
-  ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
+  ; GFX90A-NEXT: bb.23.bb40:
+  ; GFX90A-NEXT:   successors: %bb.24(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.26.Flow41:
-  ; GFX90A-NEXT:   successors: %bb.27(0x40000000), %bb.28(0x40000000)
+  ; GFX90A-NEXT: bb.24.Flow41:
+  ; GFX90A-NEXT:   successors: %bb.25(0x40000000), %bb.26(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr38_sgpr39, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.28, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.26, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.27.bb33:
-  ; GFX90A-NEXT:   successors: %bb.28(0x80000000)
+  ; GFX90A-NEXT: bb.25.bb33:
+  ; GFX90A-NEXT:   successors: %bb.26(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.28.Flow42:
-  ; GFX90A-NEXT:   successors: %bb.34(0x40000000), %bb.29(0x40000000)
+  ; GFX90A-NEXT: bb.26.Flow42:
+  ; GFX90A-NEXT:   successors: %bb.32(0x40000000), %bb.27(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.34, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.32, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.29.Flow43:
-  ; GFX90A-NEXT:   successors: %bb.30(0x40000000), %bb.31(0x40000000)
+  ; GFX90A-NEXT: bb.27.Flow43:
+  ; GFX90A-NEXT:   successors: %bb.28(0x40000000), %bb.29(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.31, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.29, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.30.bb19:
-  ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
+  ; GFX90A-NEXT: bb.28.bb19:
+  ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.31.Flow44:
-  ; GFX90A-NEXT:   successors: %bb.32(0x40000000), %bb.33(0x40000000)
+  ; GFX90A-NEXT: bb.29.Flow44:
+  ; GFX90A-NEXT:   successors: %bb.30(0x40000000), %bb.31(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr54_sgpr55, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.33, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.31, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.32.UnifiedUnreachableBlock:
-  ; GFX90A-NEXT:   successors: %bb.33(0x80000000)
+  ; GFX90A-NEXT: bb.30.UnifiedUnreachableBlock:
+  ; GFX90A-NEXT:   successors: %bb.31(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   SI_MASKED_UNREACHABLE
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.33.UnifiedReturnBlock:
+  ; GFX90A-NEXT: bb.31.UnifiedReturnBlock:
   ; GFX90A-NEXT:   liveins: $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_ENDPGM 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.34.bb26:
-  ; GFX90A-NEXT:   successors: %bb.29(0x80000000)
+  ; GFX90A-NEXT: bb.32.bb26:
+  ; GFX90A-NEXT:   successors: %bb.27(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr54_sgpr55, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_OR_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.29
+  ; GFX90A-NEXT:   S_BRANCH %bb.27
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.35.bb20:
-  ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
+  ; GFX90A-NEXT: bb.33.bb20:
+  ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.34(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1)
@@ -387,24 +373,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.35, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.36.Flow21:
-  ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.34.Flow21:
+  ; GFX90A-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.6
+  ; GFX90A-NEXT:   S_BRANCH %bb.4
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.37.bb27:
-  ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
+  ; GFX90A-NEXT: bb.35.bb27:
+  ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr42_sgpr43, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1)
@@ -426,18 +412,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.38.Flow22:
-  ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.36.Flow22:
+  ; GFX90A-NEXT:   successors: %bb.34(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -454,10 +440,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_ANDN2_B64 killed renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_OR_B64 killed renamable $sgpr28_sgpr29, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.36
+  ; GFX90A-NEXT:   S_BRANCH %bb.34
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.39.bb34:
-  ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
+  ; GFX90A-NEXT: bb.37.bb34:
+  ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1)
@@ -476,18 +462,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.40.Flow23:
-  ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.38.Flow23:
+  ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -503,10 +489,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr38_sgpr39, killed renamable $sgpr40_sgpr41, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.38
+  ; GFX90A-NEXT:   S_BRANCH %bb.36
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.41.bb41:
-  ; GFX90A-NEXT:   successors: %bb.47(0x40000000), %bb.42(0x40000000)
+  ; GFX90A-NEXT: bb.39.bb41:
+  ; GFX90A-NEXT:   successors: %bb.45(0x40000000), %bb.40(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
@@ -527,18 +513,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.47, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.45, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.42.Flow24:
-  ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.40.Flow24:
+  ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec
@@ -554,10 +540,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr16_sgpr17 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr16_sgpr17, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.40
+  ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.43.bb55:
-  ; GFX90A-NEXT:   successors: %bb.49(0x40000000), %bb.44(0x40000000)
+  ; GFX90A-NEXT: bb.41.bb55:
+  ; GFX90A-NEXT:   successors: %bb.47(0x40000000), %bb.42(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc
@@ -567,10 +553,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr46_sgpr47, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.49, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.47, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.44:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
+  ; GFX90A-NEXT: bb.42:
+  ; GFX90A-NEXT:   successors: %bb.43(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
@@ -581,23 +567,23 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.45.Flow26:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+  ; GFX90A-NEXT: bb.43.Flow26:
+  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr16, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr60, $vgpr61, $vgpr62, $vgpr63
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.46.Flow26:
-  ; GFX90A-NEXT:   successors: %bb.48(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.44.Flow26:
+  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
@@ -609,10 +595,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.48
+  ; GFX90A-NEXT:   S_BRANCH %bb.46
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.47.bb48:
-  ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.48(0x40000000)
+  ; GFX90A-NEXT: bb.45.bb48:
+  ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.46(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr50_sgpr51, $sgpr44_sgpr45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
@@ -635,18 +621,18 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.48.Flow25:
-  ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.46.Flow25:
+  ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr16_sgpr17, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
@@ -660,17 +646,17 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.42
+  ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.49.bb63:
-  ; GFX90A-NEXT:   successors: %bb.51(0x40000000), %bb.50(0x40000000)
+  ; GFX90A-NEXT: bb.47.bb63:
+  ; GFX90A-NEXT:   successors: %bb.49(0x40000000), %bb.48(0x40000000)
   ; GFX90A-NEXT:   liveins: $vcc, $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.51, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.49, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.50:
-  ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
+  ; GFX90A-NEXT: bb.48:
+  ; GFX90A-NEXT:   successors: %bb.43(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr28_sgpr29
@@ -681,26 +667,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.45
+  ; GFX90A-NEXT:   S_BRANCH %bb.43
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.51.bb68:
-  ; GFX90A-NEXT:   successors: %bb.55(0x40000000), %bb.52(0x40000000)
+  ; GFX90A-NEXT: bb.49.bb68:
+  ; GFX90A-NEXT:   successors: %bb.53(0x40000000), %bb.50(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.55, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.53, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.52:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
+  ; GFX90A-NEXT: bb.50:
+  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr52_sgpr53
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
@@ -711,26 +697,26 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.46
+  ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.53.bb80:
-  ; GFX90A-NEXT:   successors: %bb.60(0x40000000), %bb.54(0x40000000)
+  ; GFX90A-NEXT: bb.51.bb80:
+  ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.52(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr15 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr15, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr48_sgpr49 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.60, implicit killed $scc
+  ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.62, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.54:
-  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
+  ; GFX90A-NEXT: bb.52:
+  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
@@ -740,16 +726,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   S_BRANCH %bb.62
+  ; GFX90A-NEXT:   S_BRANCH %bb.64
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.55.bb73:
-  ; GFX90A-NEXT:   successors: %bb.53(0x40000000), %bb.56(0x40000000)
+  ; GFX90A-NEXT: bb.53.bb73:
+  ; GFX90A-NEXT:   successors: %bb.51(0x40000000), %bb.54(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr54_sgpr55:0x000000000000000F, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50_sgpr51
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1)
@@ -765,27 +751,27 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.53, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.51, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.56.Flow29:
-  ; GFX90A-NEXT:   successors: %bb.46(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.54.Flow29:
+  ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.46
+  ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.57.bb90:
-  ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
+  ; GFX90A-NEXT: bb.55.bb90:
+  ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr50_sgpr51, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr51 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
@@ -794,17 +780,70 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr54, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr55, killed $vgpr10, 1, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr50 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr8_sgpr9, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec
-  ; GFX90A-NEXT:   S_BRANCH %bb.61
+  ; GFX90A-NEXT:   S_BRANCH %bb.63
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.56.bb103:
+  ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.57(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.57:
+  ; GFX90A-NEXT:   successors: %bb.59(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr15 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr23 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
+  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 0
+  ; GFX90A-NEXT:   S_BRANCH %bb.59
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.58.bb105:
+  ; GFX90A-NEXT:   successors: %bb.59(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr52 = V_ASHRREV_I32_e32 31, $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr23 = S_MOV_B32 0
+  ; GFX90A-NEXT:   renamable $sgpr15 = S_MOV_B32 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.58:
-  ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.59.Flow18:
+  ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr31, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr25 = COPY renamable $vgpr2, implicit $exec
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.60.Flow17:
+  ; GFX90A-NEXT:   successors: %bb.2(0x40000000), %bb.61(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $sgpr33, $vgpr25, $vgpr31, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.61:
+  ; GFX90A-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX90A-NEXT:   liveins: $exec, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr23, $vgpr25, $vgpr30, $vgpr31, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr42_sgpr43, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr17 = COPY killed renamable $sgpr15, implicit $exec
@@ -831,35 +870,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr40_vgpr41 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr46_vgpr47 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr50 = COPY renamable $vgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr53 = COPY renamable $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr17, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr13 = COPY renamable $vgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $vgpr15, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.7
+  ; GFX90A-NEXT:   S_BRANCH %bb.5
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.59.bb105:
-  ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54_sgpr55:0x000000000000000F, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000FF, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr15, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr23 = S_MOV_B32 0
-  ; GFX90A-NEXT:   renamable $sgpr15 = S_MOV_B32 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.60.bb85:
-  ; GFX90A-NEXT:   successors: %bb.57(0x40000000), %bb.61(0x40000000)
+  ; GFX90A-NEXT: bb.62.bb85:
+  ; GFX90A-NEXT:   successors: %bb.55(0x40000000), %bb.63(0x40000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr54_sgpr55:0x000000000000000F, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec
@@ -872,24 +892,24 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr50 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr51 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
   ; GFX90A-NEXT:   renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
   ; GFX90A-NEXT:   $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.57, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.55, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.61.Flow31:
-  ; GFX90A-NEXT:   successors: %bb.62(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.63.Flow31:
+  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.62.Flow30:
-  ; GFX90A-NEXT:   successors: %bb.56(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.64.Flow30:
+  ; GFX90A-NEXT:   successors: %bb.54(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
@@ -897,134 +917,134 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr28_sgpr29, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr54_sgpr55, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_BRANCH %bb.56
+  ; GFX90A-NEXT:   S_BRANCH %bb.54
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.63.bb140:
-  ; GFX90A-NEXT:   successors: %bb.69(0x40000000), %bb.64(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.65.bb140:
+  ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.66(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr25, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr26_sgpr27, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.69, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.64.Flow13:
-  ; GFX90A-NEXT:   successors: %bb.65(0x40000000), %bb.67(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.66.Flow13:
+  ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.69(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.67, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.69, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.65.bb159:
-  ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.67.bb159:
+  ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.68(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_XOR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.68, implicit $exec
+  ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.70, implicit $exec
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.66.Flow10:
-  ; GFX90A-NEXT:   successors: %bb.67(0x80000000)
+  ; GFX90A-NEXT: bb.68.Flow10:
+  ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr8_sgpr9 = S_ANDN2_SAVEEXEC_B64 $sgpr8_sgpr9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.67.Flow14:
-  ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
+  ; GFX90A-NEXT: bb.69.Flow14:
+  ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
   ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = COPY $exec
-  ; GFX90A-NEXT:   S_BRANCH %bb.8
+  ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.68.bb161:
-  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.70.bb161:
+  ; GFX90A-NEXT:   successors: %bb.68(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr52, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr23, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr51, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr50, killed $vgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
-  ; GFX90A-NEXT:   S_BRANCH %bb.66
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.69.bb174:
-  ; GFX90A-NEXT:   successors: %bb.73(0x40000000), %bb.70(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr8_sgpr9, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr8_sgpr9, implicit $exec
+  ; GFX90A-NEXT:   S_BRANCH %bb.68
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT: bb.71.bb174:
+  ; GFX90A-NEXT:   successors: %bb.75(0x40000000), %bb.72(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr25, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr24_sgpr25, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr24 = V_OR_B32_e32 1, $vgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr24, killed $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr26 = V_CNDMASK_B32_e64 0, $vgpr32, 0, 0, $sgpr8_sgpr9, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr26, $vgpr18, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr28 = V_OR_B32_e32 $vgpr34, $vgpr12, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr48 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr28, killed $sgpr8_sgpr9, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.73, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.75, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.70.Flow:
-  ; GFX90A-NEXT:   successors: %bb.71(0x40000000), %bb.72(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.72.Flow:
+  ; GFX90A-NEXT:   successors: %bb.73(0x40000000), %bb.74(0x40000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc
-  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
+  ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.74, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.71.bb186:
-  ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.73.bb186:
+  ; GFX90A-NEXT:   successors: %bb.74(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr18, $vgpr2, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr25 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr27 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr29 = COPY renamable $vgpr25, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr25, renamable $vgpr24_vgpr25, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr25, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr25, killed renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr25, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.72.Flow9:
-  ; GFX90A-NEXT:   successors: %bb.64(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.74.Flow9:
+  ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.64
+  ; GFX90A-NEXT:   S_BRANCH %bb.66
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT: bb.73.bb196:
-  ; GFX90A-NEXT:   successors: %bb.70(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: bb.75.bb196:
+  ; GFX90A-NEXT:   successors: %bb.72(0x80000000)
+  ; GFX90A-NEXT:   liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr50, $vgpr51, $vgpr52, $sgpr4_sgpr5, $sgpr6_sgpr7:0x000000000000000F, $sgpr10_sgpr11, $sgpr16_sgpr17, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr16_sgpr17_sgpr18_sgpr19:0x00000000000000F0, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x0000000000000003, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 $vgpr48, killed $vgpr16, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr8_sgpr9 = S_MOV_B64 0
-  ; GFX90A-NEXT:   S_BRANCH %bb.70
+  ; GFX90A-NEXT:   S_BRANCH %bb.72
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i11 = icmp eq i32 %i, 0
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index 8435e016096621..21ac6a37751d87 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -27,9 +27,9 @@ define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
 ; CHECK-LABEL: generic_weak(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<29>;
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<35>;
 ; CHECK-NEXT:    .reg .f32 %f<15>;
-; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .b64 %rd<25>;
 ; CHECK-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -79,9 +79,7 @@ define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
 ; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
-; CHECK-NEXT:    st.u32 [%rd3], %r14;
-; CHECK-NEXT:    ld.u32 %r15, [%rd3];
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r14;
 ; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
 ; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
 ; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
@@ -89,34 +87,52 @@ define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
 ; CHECK-NEXT:    ld.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
 ; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
 ; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    mov.b32 %r17, {%rs26, %rs25};
 ; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
 ; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
-; CHECK-NEXT:    st.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
-; CHECK-NEXT:    ld.v2.u32 {%r17, %r18}, [%rd4];
-; CHECK-NEXT:    add.s32 %r19, %r18, 1;
-; CHECK-NEXT:    add.s32 %r20, %r17, 1;
-; CHECK-NEXT:    st.v2.u32 [%rd4], {%r20, %r19};
-; CHECK-NEXT:    ld.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
-; CHECK-NEXT:    add.s32 %r25, %r24, 1;
-; CHECK-NEXT:    add.s32 %r26, %r23, 1;
-; CHECK-NEXT:    add.s32 %r27, %r22, 1;
-; CHECK-NEXT:    add.s32 %r28, %r21, 1;
-; CHECK-NEXT:    st.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
-; CHECK-NEXT:    ld.v2.u64 {%rd7, %rd8}, [%rd4];
-; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
-; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
-; CHECK-NEXT:    st.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    mov.b32 %r18, {%rs28, %rs27};
+; CHECK-NEXT:    add.s32 %r21, %r17, 1;
+; CHECK-NEXT:    add.s32 %r22, %r18, 1;
+; CHECK-NEXT:    st.v2.u32 [%rd4], {%r22, %r21};
+; CHECK-NEXT:    ld.v4.u32 {%r23, %r24, %r25, %r26}, [%rd4];
+; CHECK-NEXT:    add.s32 %r27, %r23, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd7, %r27;
+; CHECK-NEXT:    add.s32 %r28, %r24, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd8, %r28;
+; CHECK-NEXT:    shl.b64 %rd9, %rd8, 32;
+; CHECK-NEXT:    or.b64 %rd10, %rd7, %rd9;
+; CHECK-NEXT:    add.s32 %r29, %r25, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd11, %r29;
+; CHECK-NEXT:    add.s32 %r30, %r26, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd12, %r30;
+; CHECK-NEXT:    shl.b64 %rd13, %rd12, 32;
+; CHECK-NEXT:    or.b64 %rd14, %rd11, %rd13;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 1;
+; CHECK-NEXT:    add.s64 %rd16, %rd10, 1;
+; CHECK-NEXT:    st.v2.u64 [%rd4], {%rd16, %rd15};
 ; CHECK-NEXT:    ld.v2.f32 {%f3, %f4}, [%rd4];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    st.v2.f32 [%rd4], {%f6, %f5};
 ; CHECK-NEXT:    ld.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
-; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
-; CHECK-NEXT:    st.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
-; CHECK-NEXT:    ld.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r31, %f11;
+; CHECK-NEXT:    cvt.u64.u32 %rd17, %r31;
+; CHECK-NEXT:    add.rn.f32 %f12, %f10, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r32, %f12;
+; CHECK-NEXT:    cvt.u64.u32 %rd18, %r32;
+; CHECK-NEXT:    shl.b64 %rd19, %rd18, 32;
+; CHECK-NEXT:    or.b64 %rd20, %rd17, %rd19;
+; CHECK-NEXT:    add.rn.f32 %f13, %f7, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r33, %f13;
+; CHECK-NEXT:    cvt.u64.u32 %rd21, %r33;
+; CHECK-NEXT:    add.rn.f32 %f14, %f8, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r34, %f14;
+; CHECK-NEXT:    cvt.u64.u32 %rd22, %r34;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd21, %rd23;
+; CHECK-NEXT:    mov.b64 %fd3, %rd24;
+; CHECK-NEXT:    mov.b64 %fd4, %rd20;
 ; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
 ; CHECK-NEXT:    st.v2.f64 [%rd4], {%fd6, %fd5};
@@ -696,9 +712,9 @@ define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace
 ; CHECK-LABEL: global_weak(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<29>;
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<35>;
 ; CHECK-NEXT:    .reg .f32 %f<15>;
-; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .b64 %rd<25>;
 ; CHECK-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -748,9 +764,7 @@ define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
 ; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
-; CHECK-NEXT:    st.global.u32 [%rd3], %r14;
-; CHECK-NEXT:    ld.global.u32 %r15, [%rd3];
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r14;
 ; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
 ; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
 ; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
@@ -758,34 +772,52 @@ define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace
 ; CHECK-NEXT:    ld.global.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
 ; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
 ; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    mov.b32 %r17, {%rs26, %rs25};
 ; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
 ; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
-; CHECK-NEXT:    st.global.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
-; CHECK-NEXT:    ld.global.v2.u32 {%r17, %r18}, [%rd4];
-; CHECK-NEXT:    add.s32 %r19, %r18, 1;
-; CHECK-NEXT:    add.s32 %r20, %r17, 1;
-; CHECK-NEXT:    st.global.v2.u32 [%rd4], {%r20, %r19};
-; CHECK-NEXT:    ld.global.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
-; CHECK-NEXT:    add.s32 %r25, %r24, 1;
-; CHECK-NEXT:    add.s32 %r26, %r23, 1;
-; CHECK-NEXT:    add.s32 %r27, %r22, 1;
-; CHECK-NEXT:    add.s32 %r28, %r21, 1;
-; CHECK-NEXT:    st.global.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
-; CHECK-NEXT:    ld.global.v2.u64 {%rd7, %rd8}, [%rd4];
-; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
-; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
-; CHECK-NEXT:    st.global.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    mov.b32 %r18, {%rs28, %rs27};
+; CHECK-NEXT:    add.s32 %r21, %r17, 1;
+; CHECK-NEXT:    add.s32 %r22, %r18, 1;
+; CHECK-NEXT:    st.global.v2.u32 [%rd4], {%r22, %r21};
+; CHECK-NEXT:    ld.global.v4.u32 {%r23, %r24, %r25, %r26}, [%rd4];
+; CHECK-NEXT:    add.s32 %r27, %r23, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd7, %r27;
+; CHECK-NEXT:    add.s32 %r28, %r24, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd8, %r28;
+; CHECK-NEXT:    shl.b64 %rd9, %rd8, 32;
+; CHECK-NEXT:    or.b64 %rd10, %rd7, %rd9;
+; CHECK-NEXT:    add.s32 %r29, %r25, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd11, %r29;
+; CHECK-NEXT:    add.s32 %r30, %r26, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd12, %r30;
+; CHECK-NEXT:    shl.b64 %rd13, %rd12, 32;
+; CHECK-NEXT:    or.b64 %rd14, %rd11, %rd13;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 1;
+; CHECK-NEXT:    add.s64 %rd16, %rd10, 1;
+; CHECK-NEXT:    st.global.v2.u64 [%rd4], {%rd16, %rd15};
 ; CHECK-NEXT:    ld.global.v2.f32 {%f3, %f4}, [%rd4];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    st.global.v2.f32 [%rd4], {%f6, %f5};
 ; CHECK-NEXT:    ld.global.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
-; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
-; CHECK-NEXT:    st.global.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
-; CHECK-NEXT:    ld.global.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r31, %f11;
+; CHECK-NEXT:    cvt.u64.u32 %rd17, %r31;
+; CHECK-NEXT:    add.rn.f32 %f12, %f10, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r32, %f12;
+; CHECK-NEXT:    cvt.u64.u32 %rd18, %r32;
+; CHECK-NEXT:    shl.b64 %rd19, %rd18, 32;
+; CHECK-NEXT:    or.b64 %rd20, %rd17, %rd19;
+; CHECK-NEXT:    add.rn.f32 %f13, %f7, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r33, %f13;
+; CHECK-NEXT:    cvt.u64.u32 %rd21, %r33;
+; CHECK-NEXT:    add.rn.f32 %f14, %f8, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r34, %f14;
+; CHECK-NEXT:    cvt.u64.u32 %rd22, %r34;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd21, %rd23;
+; CHECK-NEXT:    mov.b64 %fd3, %rd24;
+; CHECK-NEXT:    mov.b64 %fd4, %rd20;
 ; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
 ; CHECK-NEXT:    st.global.v2.f64 [%rd4], {%fd6, %fd5};
@@ -1408,9 +1440,9 @@ define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace
 ; CHECK-LABEL: shared_weak(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<29>;
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<35>;
 ; CHECK-NEXT:    .reg .f32 %f<15>;
-; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .b64 %rd<25>;
 ; CHECK-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -1460,9 +1492,7 @@ define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
 ; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
-; CHECK-NEXT:    st.shared.u32 [%rd3], %r14;
-; CHECK-NEXT:    ld.shared.u32 %r15, [%rd3];
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r14;
 ; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
 ; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
 ; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
@@ -1470,34 +1500,52 @@ define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace
 ; CHECK-NEXT:    ld.shared.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
 ; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
 ; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    mov.b32 %r17, {%rs26, %rs25};
 ; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
 ; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
-; CHECK-NEXT:    st.shared.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
-; CHECK-NEXT:    ld.shared.v2.u32 {%r17, %r18}, [%rd4];
-; CHECK-NEXT:    add.s32 %r19, %r18, 1;
-; CHECK-NEXT:    add.s32 %r20, %r17, 1;
-; CHECK-NEXT:    st.shared.v2.u32 [%rd4], {%r20, %r19};
-; CHECK-NEXT:    ld.shared.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
-; CHECK-NEXT:    add.s32 %r25, %r24, 1;
-; CHECK-NEXT:    add.s32 %r26, %r23, 1;
-; CHECK-NEXT:    add.s32 %r27, %r22, 1;
-; CHECK-NEXT:    add.s32 %r28, %r21, 1;
-; CHECK-NEXT:    st.shared.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
-; CHECK-NEXT:    ld.shared.v2.u64 {%rd7, %rd8}, [%rd4];
-; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
-; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
-; CHECK-NEXT:    st.shared.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    mov.b32 %r18, {%rs28, %rs27};
+; CHECK-NEXT:    add.s32 %r21, %r17, 1;
+; CHECK-NEXT:    add.s32 %r22, %r18, 1;
+; CHECK-NEXT:    st.shared.v2.u32 [%rd4], {%r22, %r21};
+; CHECK-NEXT:    ld.shared.v4.u32 {%r23, %r24, %r25, %r26}, [%rd4];
+; CHECK-NEXT:    add.s32 %r27, %r23, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd7, %r27;
+; CHECK-NEXT:    add.s32 %r28, %r24, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd8, %r28;
+; CHECK-NEXT:    shl.b64 %rd9, %rd8, 32;
+; CHECK-NEXT:    or.b64 %rd10, %rd7, %rd9;
+; CHECK-NEXT:    add.s32 %r29, %r25, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd11, %r29;
+; CHECK-NEXT:    add.s32 %r30, %r26, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd12, %r30;
+; CHECK-NEXT:    shl.b64 %rd13, %rd12, 32;
+; CHECK-NEXT:    or.b64 %rd14, %rd11, %rd13;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 1;
+; CHECK-NEXT:    add.s64 %rd16, %rd10, 1;
+; CHECK-NEXT:    st.shared.v2.u64 [%rd4], {%rd16, %rd15};
 ; CHECK-NEXT:    ld.shared.v2.f32 {%f3, %f4}, [%rd4];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    st.shared.v2.f32 [%rd4], {%f6, %f5};
 ; CHECK-NEXT:    ld.shared.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
-; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
-; CHECK-NEXT:    st.shared.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
-; CHECK-NEXT:    ld.shared.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r31, %f11;
+; CHECK-NEXT:    cvt.u64.u32 %rd17, %r31;
+; CHECK-NEXT:    add.rn.f32 %f12, %f10, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r32, %f12;
+; CHECK-NEXT:    cvt.u64.u32 %rd18, %r32;
+; CHECK-NEXT:    shl.b64 %rd19, %rd18, 32;
+; CHECK-NEXT:    or.b64 %rd20, %rd17, %rd19;
+; CHECK-NEXT:    add.rn.f32 %f13, %f7, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r33, %f13;
+; CHECK-NEXT:    cvt.u64.u32 %rd21, %r33;
+; CHECK-NEXT:    add.rn.f32 %f14, %f8, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r34, %f14;
+; CHECK-NEXT:    cvt.u64.u32 %rd22, %r34;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd21, %rd23;
+; CHECK-NEXT:    mov.b64 %fd3, %rd24;
+; CHECK-NEXT:    mov.b64 %fd4, %rd20;
 ; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
 ; CHECK-NEXT:    st.shared.v2.f64 [%rd4], {%fd6, %fd5};
@@ -2052,9 +2100,9 @@ define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(
 ; CHECK-LABEL: local_weak(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b16 %rs<29>;
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<35>;
 ; CHECK-NEXT:    .reg .f32 %f<15>;
-; CHECK-NEXT:    .reg .b64 %rd<11>;
+; CHECK-NEXT:    .reg .b64 %rd<25>;
 ; CHECK-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
@@ -2104,9 +2152,7 @@ define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(
 ; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
 ; CHECK-NEXT:    cvt.u32.u16 %r13, %rs16;
 ; CHECK-NEXT:    bfi.b32 %r14, %r13, %r11, 24, 8;
-; CHECK-NEXT:    st.local.u32 [%rd3], %r14;
-; CHECK-NEXT:    ld.local.u32 %r15, [%rd3];
-; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r15;
+; CHECK-NEXT:    mov.b32 {%rs17, %rs18}, %r14;
 ; CHECK-NEXT:    add.s16 %rs19, %rs18, 1;
 ; CHECK-NEXT:    add.s16 %rs20, %rs17, 1;
 ; CHECK-NEXT:    mov.b32 %r16, {%rs20, %rs19};
@@ -2114,34 +2160,52 @@ define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(
 ; CHECK-NEXT:    ld.local.v4.u16 {%rs21, %rs22, %rs23, %rs24}, [%rd4];
 ; CHECK-NEXT:    add.s16 %rs25, %rs24, 1;
 ; CHECK-NEXT:    add.s16 %rs26, %rs23, 1;
+; CHECK-NEXT:    mov.b32 %r17, {%rs26, %rs25};
 ; CHECK-NEXT:    add.s16 %rs27, %rs22, 1;
 ; CHECK-NEXT:    add.s16 %rs28, %rs21, 1;
-; CHECK-NEXT:    st.local.v4.u16 [%rd4], {%rs28, %rs27, %rs26, %rs25};
-; CHECK-NEXT:    ld.local.v2.u32 {%r17, %r18}, [%rd4];
-; CHECK-NEXT:    add.s32 %r19, %r18, 1;
-; CHECK-NEXT:    add.s32 %r20, %r17, 1;
-; CHECK-NEXT:    st.local.v2.u32 [%rd4], {%r20, %r19};
-; CHECK-NEXT:    ld.local.v4.u32 {%r21, %r22, %r23, %r24}, [%rd4];
-; CHECK-NEXT:    add.s32 %r25, %r24, 1;
-; CHECK-NEXT:    add.s32 %r26, %r23, 1;
-; CHECK-NEXT:    add.s32 %r27, %r22, 1;
-; CHECK-NEXT:    add.s32 %r28, %r21, 1;
-; CHECK-NEXT:    st.local.v4.u32 [%rd4], {%r28, %r27, %r26, %r25};
-; CHECK-NEXT:    ld.local.v2.u64 {%rd7, %rd8}, [%rd4];
-; CHECK-NEXT:    add.s64 %rd9, %rd8, 1;
-; CHECK-NEXT:    add.s64 %rd10, %rd7, 1;
-; CHECK-NEXT:    st.local.v2.u64 [%rd4], {%rd10, %rd9};
+; CHECK-NEXT:    mov.b32 %r18, {%rs28, %rs27};
+; CHECK-NEXT:    add.s32 %r21, %r17, 1;
+; CHECK-NEXT:    add.s32 %r22, %r18, 1;
+; CHECK-NEXT:    st.local.v2.u32 [%rd4], {%r22, %r21};
+; CHECK-NEXT:    ld.local.v4.u32 {%r23, %r24, %r25, %r26}, [%rd4];
+; CHECK-NEXT:    add.s32 %r27, %r23, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd7, %r27;
+; CHECK-NEXT:    add.s32 %r28, %r24, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd8, %r28;
+; CHECK-NEXT:    shl.b64 %rd9, %rd8, 32;
+; CHECK-NEXT:    or.b64 %rd10, %rd7, %rd9;
+; CHECK-NEXT:    add.s32 %r29, %r25, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd11, %r29;
+; CHECK-NEXT:    add.s32 %r30, %r26, 1;
+; CHECK-NEXT:    cvt.u64.u32 %rd12, %r30;
+; CHECK-NEXT:    shl.b64 %rd13, %rd12, 32;
+; CHECK-NEXT:    or.b64 %rd14, %rd11, %rd13;
+; CHECK-NEXT:    add.s64 %rd15, %rd14, 1;
+; CHECK-NEXT:    add.s64 %rd16, %rd10, 1;
+; CHECK-NEXT:    st.local.v2.u64 [%rd4], {%rd16, %rd15};
 ; CHECK-NEXT:    ld.local.v2.f32 {%f3, %f4}, [%rd4];
 ; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
 ; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
 ; CHECK-NEXT:    st.local.v2.f32 [%rd4], {%f6, %f5};
 ; CHECK-NEXT:    ld.local.v4.f32 {%f7, %f8, %f9, %f10}, [%rd4];
-; CHECK-NEXT:    add.rn.f32 %f11, %f10, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f12, %f9, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f13, %f8, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f14, %f7, 0f3F800000;
-; CHECK-NEXT:    st.local.v4.f32 [%rd4], {%f14, %f13, %f12, %f11};
-; CHECK-NEXT:    ld.local.v2.f64 {%fd3, %fd4}, [%rd4];
+; CHECK-NEXT:    add.rn.f32 %f11, %f9, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r31, %f11;
+; CHECK-NEXT:    cvt.u64.u32 %rd17, %r31;
+; CHECK-NEXT:    add.rn.f32 %f12, %f10, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r32, %f12;
+; CHECK-NEXT:    cvt.u64.u32 %rd18, %r32;
+; CHECK-NEXT:    shl.b64 %rd19, %rd18, 32;
+; CHECK-NEXT:    or.b64 %rd20, %rd17, %rd19;
+; CHECK-NEXT:    add.rn.f32 %f13, %f7, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r33, %f13;
+; CHECK-NEXT:    cvt.u64.u32 %rd21, %r33;
+; CHECK-NEXT:    add.rn.f32 %f14, %f8, 0f3F800000;
+; CHECK-NEXT:    mov.b32 %r34, %f14;
+; CHECK-NEXT:    cvt.u64.u32 %rd22, %r34;
+; CHECK-NEXT:    shl.b64 %rd23, %rd22, 32;
+; CHECK-NEXT:    or.b64 %rd24, %rd21, %rd23;
+; CHECK-NEXT:    mov.b64 %fd3, %rd24;
+; CHECK-NEXT:    mov.b64 %fd4, %rd20;
 ; CHECK-NEXT:    add.rn.f64 %fd5, %fd4, 0d3FF0000000000000;
 ; CHECK-NEXT:    add.rn.f64 %fd6, %fd3, 0d3FF0000000000000;
 ; CHECK-NEXT:    st.local.v2.f64 [%rd4], {%fd6, %fd5};
diff --git a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
index 5bd3580f5e95ec..3216f3c548308e 100644
--- a/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
+++ b/llvm/test/CodeGen/PowerPC/big-endian-store-forward.ll
@@ -10,7 +10,6 @@ define void @f(i16 %v) {
 ; CHECK-NEXT:    addis 4, 2, .LC0@toc@ha
 ; CHECK-NEXT:    sth 3, -2(1)
 ; CHECK-NEXT:    ld 4, .LC0@toc@l(4)
-; CHECK-NEXT:    lbz 3, -2(1)
 ; CHECK-NEXT:    stb 3, 0(4)
 ; CHECK-NEXT:    blr
   %p32 = alloca i16
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
index ac9641ff35b0cb..728e5431217aa0 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
@@ -31,28 +31,22 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    cmpdi cr1, r4, 0
 ; CHECK-NEXT:    cmpdi cr5, r5, 0
 ; CHECK-NEXT:    cmpldi cr6, r3, 0
-; CHECK-NEXT:    beq cr6, .LBB0_3
-; CHECK-NEXT:  # %bb.1: # %bb10
-; CHECK-NEXT:    lwz r3, 0(r3)
-; CHECK-NEXT:    bc 12, 4*cr1+eq, .LBB0_4
-; CHECK-NEXT:  .LBB0_2: # %bb14
+; CHECK-NEXT:    bc 12, 4*cr1+eq, .LBB0_2
+; CHECK-NEXT:  # %bb.1: # %bb14
 ; CHECK-NEXT:    lwz r5, 0(r3)
-; CHECK-NEXT:    b .LBB0_5
-; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    # implicit-def: $r3
-; CHECK-NEXT:    bc 4, 4*cr1+eq, .LBB0_2
-; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    # implicit-def: $r5
-; CHECK-NEXT:  .LBB0_5: # %bb16
+; CHECK-NEXT:  .LBB0_3: # %bb16
 ; CHECK-NEXT:    crnot 4*cr1+lt, eq
 ; CHECK-NEXT:    crnot 4*cr5+un, 4*cr5+eq
-; CHECK-NEXT:    bc 12, 4*cr5+eq, .LBB0_7
-; CHECK-NEXT:  # %bb.6: # %bb18
+; CHECK-NEXT:    bc 12, 4*cr5+eq, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %bb18
 ; CHECK-NEXT:    lwz r4, 0(r3)
-; CHECK-NEXT:    b .LBB0_8
-; CHECK-NEXT:  .LBB0_7:
+; CHECK-NEXT:    b .LBB0_6
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    # implicit-def: $r4
-; CHECK-NEXT:  .LBB0_8: # %bb20
+; CHECK-NEXT:  .LBB0_6: # %bb20
 ; CHECK-NEXT:    mfcr r12
 ; CHECK-NEXT:    cmpwi cr2, r3, -1
 ; CHECK-NEXT:    cmpwi cr3, r4, -1
@@ -62,38 +56,38 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt
 ; CHECK-NEXT:    crand 4*cr5+lt, 4*cr3+gt, 4*cr5+un
 ; CHECK-NEXT:    # implicit-def: $x3
-; CHECK-NEXT:    bc 4, 4*cr5+gt, .LBB0_10
-; CHECK-NEXT:  # %bb.9: # %bb34
+; CHECK-NEXT:    bc 4, 4*cr5+gt, .LBB0_8
+; CHECK-NEXT:  # %bb.7: # %bb34
 ; CHECK-NEXT:    ld r3, 0(r3)
-; CHECK-NEXT:  .LBB0_10: # %bb36
+; CHECK-NEXT:  .LBB0_8: # %bb36
 ; CHECK-NEXT:    cmpwi cr2, r5, 0
 ; CHECK-NEXT:    # implicit-def: $x4
-; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_12
-; CHECK-NEXT:  # %bb.11: # %bb38
+; CHECK-NEXT:    bc 4, 4*cr5+lt, .LBB0_10
+; CHECK-NEXT:  # %bb.9: # %bb38
 ; CHECK-NEXT:    ld r4, 0(r3)
-; CHECK-NEXT:  .LBB0_12: # %bb40
+; CHECK-NEXT:  .LBB0_10: # %bb40
 ; CHECK-NEXT:    crand 4*cr6+gt, 4*cr7+lt, 4*cr1+lt
 ; CHECK-NEXT:    crand 4*cr6+lt, 4*cr6+lt, 4*cr5+un
 ; CHECK-NEXT:    crnot 4*cr6+un, 4*cr1+eq
 ; CHECK-NEXT:    # implicit-def: $x6
-; CHECK-NEXT:    bc 4, 4*cr6+lt, .LBB0_14
-; CHECK-NEXT:  # %bb.13: # %bb48
+; CHECK-NEXT:    bc 4, 4*cr6+lt, .LBB0_12
+; CHECK-NEXT:  # %bb.11: # %bb48
 ; CHECK-NEXT:    ld r6, 0(r3)
-; CHECK-NEXT:  .LBB0_14: # %bb50
+; CHECK-NEXT:  .LBB0_12: # %bb50
 ; CHECK-NEXT:    cmpwi cr3, r5, -1
 ; CHECK-NEXT:    crand 4*cr7+lt, 4*cr2+lt, 4*cr6+un
 ; CHECK-NEXT:    # implicit-def: $r5
-; CHECK-NEXT:    bc 4, 4*cr6+gt, .LBB0_16
-; CHECK-NEXT:  # %bb.15: # %bb52
+; CHECK-NEXT:    bc 4, 4*cr6+gt, .LBB0_14
+; CHECK-NEXT:  # %bb.13: # %bb52
 ; CHECK-NEXT:    lwz r5, 0(r3)
-; CHECK-NEXT:  .LBB0_16: # %bb54
+; CHECK-NEXT:  .LBB0_14: # %bb54
 ; CHECK-NEXT:    mfocrf r7, 128
 ; CHECK-NEXT:    stw r7, -4(r1)
 ; CHECK-NEXT:    # implicit-def: $r7
-; CHECK-NEXT:    bc 4, 4*cr7+lt, .LBB0_18
-; CHECK-NEXT:  # %bb.17: # %bb56
+; CHECK-NEXT:    bc 4, 4*cr7+lt, .LBB0_16
+; CHECK-NEXT:  # %bb.15: # %bb56
 ; CHECK-NEXT:    lwz r7, 0(r3)
-; CHECK-NEXT:  .LBB0_18: # %bb58
+; CHECK-NEXT:  .LBB0_16: # %bb58
 ; CHECK-NEXT:    lwz r6, 92(r6)
 ; CHECK-NEXT:    crand 4*cr7+un, 4*cr3+gt, 4*cr6+un
 ; CHECK-NEXT:    cmpwi cr3, r5, 1
@@ -106,10 +100,10 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    crand 4*cr7+lt, 4*cr4+lt, 4*cr7+lt
 ; CHECK-NEXT:    cmpwi r6, 1
 ; CHECK-NEXT:    crand 4*cr6+lt, lt, 4*cr6+lt
-; CHECK-NEXT:    bc 4, 4*cr6+gt, .LBB0_20
-; CHECK-NEXT:  # %bb.19: # %bb68
+; CHECK-NEXT:    bc 4, 4*cr6+gt, .LBB0_18
+; CHECK-NEXT:  # %bb.17: # %bb68
 ; CHECK-NEXT:    ld r5, 0(r3)
-; CHECK-NEXT:  .LBB0_20: # %bb70
+; CHECK-NEXT:  .LBB0_18: # %bb70
 ; CHECK-NEXT:    ld r6, 0(r3)
 ; CHECK-NEXT:    lwz r9, -4(r1)
 ; CHECK-NEXT:    crandc 4*cr5+gt, 4*cr5+gt, 4*cr7+eq
diff --git a/llvm/test/Transforms/EarlyCSE/invariant.start.ll b/llvm/test/Transforms/EarlyCSE/invariant.start.ll
index 554d3ce519b5ee..ad25137d20f466 100644
--- a/llvm/test/Transforms/EarlyCSE/invariant.start.ll
+++ b/llvm/test/Transforms/EarlyCSE/invariant.start.ll
@@ -472,15 +472,22 @@ define void @test_dse_after_load(ptr %p, i1 %cnd) {
 ; typed due to the user of a Value to represent the address.  Note that other
 ; passes will canonicalize away the bitcasts in this example.
 define i32 @test_false_negative_types(ptr %p) {
-; CHECK-LABEL: define {{[^@]+}}@test_false_negative_types
-; CHECK-SAME: (ptr [[P:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.invariant.start.p0(i64 4, ptr [[P]])
-; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4
-; CHECK-NEXT:    call void @clobber()
-; CHECK-NEXT:    [[V2F:%.*]] = load float, ptr [[P]], align 4
-; CHECK-NEXT:    [[V2:%.*]] = bitcast float [[V2F]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[V1]], [[V2]]
-; CHECK-NEXT:    ret i32 [[SUB]]
+; NO_ASSUME-LABEL: define {{[^@]+}}@test_false_negative_types
+; NO_ASSUME-SAME: (ptr [[P:%.*]]) {
+; NO_ASSUME-NEXT:    [[TMP1:%.*]] = call ptr @llvm.invariant.start.p0(i64 4, ptr [[P]])
+; NO_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4
+; NO_ASSUME-NEXT:    [[TMP2:%.*]] = bitcast i32 [[V1]] to float
+; NO_ASSUME-NEXT:    call void @clobber()
+; NO_ASSUME-NEXT:    ret i32 0
+;
+; USE_ASSUME-LABEL: define {{[^@]+}}@test_false_negative_types
+; USE_ASSUME-SAME: (ptr [[P:%.*]]) {
+; USE_ASSUME-NEXT:    [[TMP1:%.*]] = call ptr @llvm.invariant.start.p0(i64 4, ptr [[P]])
+; USE_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4
+; USE_ASSUME-NEXT:    [[TMP2:%.*]] = bitcast i32 [[V1]] to float
+; USE_ASSUME-NEXT:    call void @clobber()
+; USE_ASSUME-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 4), "nonnull"(ptr [[P]]), "align"(ptr [[P]], i64 4) ]
+; USE_ASSUME-NEXT:    ret i32 0
 ;
   call ptr @llvm.invariant.start.p0(i64 4, ptr %p)
   %v1 = load i32, ptr %p
@@ -571,13 +578,13 @@ define i32 @test_false_negative_scope(ptr %p) {
 define i32 @test_invariant_load_scope(ptr %p) {
 ; NO_ASSUME-LABEL: define {{[^@]+}}@test_invariant_load_scope
 ; NO_ASSUME-SAME: (ptr [[P:%.*]]) {
-; NO_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4, !invariant.load !4
+; NO_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4, !invariant.load [[META4:![0-9]+]]
 ; NO_ASSUME-NEXT:    call void @clobber()
 ; NO_ASSUME-NEXT:    ret i32 0
 ;
 ; USE_ASSUME-LABEL: define {{[^@]+}}@test_invariant_load_scope
 ; USE_ASSUME-SAME: (ptr [[P:%.*]]) {
-; USE_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4, !invariant.load !4
+; USE_ASSUME-NEXT:    [[V1:%.*]] = load i32, ptr [[P]], align 4, !invariant.load [[META4:![0-9]+]]
 ; USE_ASSUME-NEXT:    call void @clobber()
 ; USE_ASSUME-NEXT:    call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[P]], i64 4), "nonnull"(ptr [[P]]), "align"(ptr [[P]], i64 4) ]
 ; USE_ASSUME-NEXT:    ret i32 0
@@ -589,7 +596,6 @@ define i32 @test_invariant_load_scope(ptr %p) {
   ret i32 %sub
 }
 
-; USE_ASSUME: declare void @llvm.assume(i1 noundef)
 
 !0 = !{!1, !1, i64 0}
 !1 = !{!"float", !2, i64 0}
diff --git a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll
index da507f13730e87..b7283ab4b30833 100644
--- a/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll
+++ b/llvm/test/Transforms/EarlyCSE/opaque-ptr.ll
@@ -4,10 +4,8 @@
 define i32 @different_types_load(ptr %p) {
 ; CHECK-LABEL: @different_types_load(
 ; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[P]], align 4
-; CHECK-NEXT:    [[V2_C:%.*]] = trunc i64 [[V2]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[V1]], [[V2_C]]
-; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[V1]] to i64
+; CHECK-NEXT:    ret i32 0
 ;
   %v1 = load i32, ptr %p
   %v2 = load i64, ptr %p
@@ -36,10 +34,8 @@ define i32 @different_types_vector_load(ptr %p) {
 define i32 @different_types_store(ptr %p, i32 %a) {
 ; CHECK-LABEL: @different_types_store(
 ; CHECK-NEXT:    store i32 [[A:%.*]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[P]], align 4
-; CHECK-NEXT:    [[V2_C:%.*]] = trunc i64 [[V2]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A]], [[V2_C]]
-; CHECK-NEXT:    ret i32 [[SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[A]] to i64
+; CHECK-NEXT:    ret i32 0
 ;
   store i32 %a, ptr %p
   %v2 = load i64, ptr %p
@@ -51,7 +47,7 @@ define i32 @different_types_store(ptr %p, i32 %a) {
 define i32 @different_elt_types_vector_load(ptr %p, <4 x i1> %c) {
 ; CHECK-LABEL: @different_elt_types_vector_load(
 ; CHECK-NEXT:    [[V1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[P:%.*]], i32 4, <4 x i1> [[C:%.*]], <4 x i32> poison)
-; CHECK-NEXT:    [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> [[C]], <4 x float> poison)
+; CHECK-NEXT:    [[V2:%.*]] = bitcast <4 x i32> [[V1]] to <4 x float>
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[V1]], i32 0
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x float> [[V2]], i32 0
 ; CHECK-NEXT:    [[E2I:%.*]] = fptosi float [[E2]] to i32
@@ -70,7 +66,7 @@ define i32 @different_elt_types_vector_load(ptr %p, <4 x i1> %c) {
 define float @different_elt_types_vector_store_load(ptr %p, <4 x i32> %v1, <4 x i1> %c) {
 ; CHECK-LABEL: @different_elt_types_vector_store_load(
 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[V1:%.*]], ptr [[P:%.*]], i32 4, <4 x i1> [[C:%.*]])
-; CHECK-NEXT:    [[V2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[P]], i32 4, <4 x i1> [[C]], <4 x float> poison)
+; CHECK-NEXT:    [[V2:%.*]] = bitcast <4 x i32> [[V1]] to <4 x float>
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x float> [[V2]], i32 0
 ; CHECK-NEXT:    ret float [[E2]]
 ;

>From a4d759c45650fa89bf68ed35006fa53e92f799e4 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Wed, 30 Oct 2024 14:06:45 +0000
Subject: [PATCH 3/3] EarlyCSE: address reviews

---
 llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 9714611cda8b0f..54b29a67fcab2c 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -966,25 +966,22 @@ class EarlyCSE {
                         const ParseMemoryInst &Later);
 
   Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType) const {
-    if (!isa<IntrinsicInst, LoadInst, StoreInst>(Inst))
-      llvm_unreachable("Instruction not supported");
+    assert((isa<IntrinsicInst, LoadInst, StoreInst>(Inst)) &&
+           "Instruction not supported");
 
     // The load or the store's first operand.
     Value *V;
     if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
-      if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
-        switch (II->getIntrinsicID()) {
-        case Intrinsic::masked_load:
-          V = II;
-          break;
-        case Intrinsic::masked_store:
-          V = II->getOperand(0);
-          break;
-        default:
-          return nullptr;
-        }
-      else
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::masked_load:
+        V = II;
+        break;
+      case Intrinsic::masked_store:
+        V = II->getOperand(0);
+        break;
+      default:
         return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
+      }
     } else {
       V = isa<LoadInst>(Inst) ? Inst : cast<StoreInst>(Inst)->getValueOperand();
     }



More information about the llvm-commits mailing list