[llvm] 7a84624 - AMDGPU: Make various vector undefs legal

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 28 07:49:01 PDT 2022


Author: Matt Arsenault
Date: 2022-09-28T10:48:52-04:00
New Revision: 7a84624079a2656c684bed6100708544500c5a32

URL: https://github.com/llvm/llvm-project/commit/7a84624079a2656c684bed6100708544500c5a32
DIFF: https://github.com/llvm/llvm-project/commit/7a84624079a2656c684bed6100708544500c5a32.diff

LOG: AMDGPU: Make various vector undefs legal

Surprisingly, these were getting legalized to zero-initialized
values.

This fixes an infinite loop when combining some vector types.
Also fixes zero initializing some undef values.

SimplifyDemandedVectorElts / SimplifyDemandedBits do not check the
legality of the undefs they produce when replacing unused
operations. As a result, vectors were turned into undefs that were
later re-legalized back into zero vectors.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/commute-shifts.ll
    llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
    llvm/test/CodeGen/AMDGPU/select-undef.ll
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
    llvm/test/CodeGen/AMDGPU/v1024.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
    llvm/test/CodeGen/AMDGPU/wqm.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dd14ca179ea4..b79bdec97876 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       case ISD::STORE:
       case ISD::BUILD_VECTOR:
       case ISD::BITCAST:
+      case ISD::UNDEF:
       case ISD::EXTRACT_VECTOR_ELT:
       case ISD::INSERT_VECTOR_ELT:
       case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
         case ISD::STORE:
         case ISD::BUILD_VECTOR:
         case ISD::BITCAST:
+        case ISD::UNDEF:
         case ISD::EXTRACT_VECTOR_ELT:
         case ISD::INSERT_VECTOR_ELT:
         case ISD::INSERT_SUBVECTOR:

diff  --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 8df85ba872bf..3697946cb5c3 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -5,14 +5,6 @@
 define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ; SI-LABEL: main:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_mov_b32 s0, 0
-; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_mov_b32 s2, s0
-; SI-NEXT:    s_mov_b32 s3, s0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s0
-; SI-NEXT:    s_mov_b32 s6, s0
-; SI-NEXT:    s_mov_b32 s7, s0
 ; SI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ;
 ; VI-LABEL: main:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_mov_b32 s2, s0
-; VI-NEXT:    s_mov_b32 s3, s0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s0
-; VI-NEXT:    s_mov_b32 s6, s0
-; VI-NEXT:    s_mov_b32 s7, s0
 ; VI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 7, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 29fc098899ee..5d985850446c 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -213,7 +213,7 @@ if.else:                                          ; preds = %entry
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
+  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
   store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
   ret void
 }
@@ -266,7 +266,7 @@ if.else:                                          ; preds = %entry
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
+  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
   store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
index b504ee6b5f6b..346131c47d9c 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -4,16 +4,8 @@
 define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GCN-LABEL: _amdgpu_ps_main:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_mov_b32 s2, s0
-; GCN-NEXT:    s_mov_b32 s3, s0
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s0
-; GCN-NEXT:    s_mov_b32 s6, s0
-; GCN-NEXT:    s_mov_b32 s7, s0
 ; GCN-NEXT:    image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 9c3c09d87485..c2947ad5e59f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -97,14 +97,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX9-NEXT:    s_branch .LBB0_4
 ; GFX9-NEXT:  .LBB0_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB0_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -237,14 +230,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:    s_branch .LBB1_4
 ; GFX9-NEXT:  .LBB1_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB1_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -377,14 +363,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:    s_branch .LBB2_4
 ; GFX9-NEXT:  .LBB2_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB2_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -555,22 +534,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX9-NEXT:    s_branch .LBB3_4
 ; GFX9-NEXT:  .LBB3_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB3_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -743,22 +707,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:    s_branch .LBB4_4
 ; GFX9-NEXT:  .LBB4_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB4_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -931,22 +880,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16
 ; GFX9-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX9-NEXT:    s_branch .LBB5_4
 ; GFX9-NEXT:  .LBB5_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB5_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 1823db53f159..8f84abdc6da1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -382,18 +382,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind {
 ; GCN-LABEL: insertelement_to_sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s12, 0
-; GCN-NEXT:    s_mov_b32 s4, s12
-; GCN-NEXT:    s_mov_b32 s5, s12
-; GCN-NEXT:    s_mov_b32 s6, s12
-; GCN-NEXT:    s_mov_b32 s7, s12
-; GCN-NEXT:    s_mov_b32 s8, s12
-; GCN-NEXT:    s_mov_b32 s9, s12
-; GCN-NEXT:    s_mov_b32 s10, s12
-; GCN-NEXT:    s_mov_b32 s11, s12
-; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef

diff  --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll
index 6597d6784e0c..f02cd3fc5e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}select_undef_lhs:
 ; GCN: s_waitcnt
@@ -43,3 +43,220 @@ define void @select_undef_n2(float addrspace(1)* %a, i32 %c) {
 }
 
 declare float @llvm.amdgcn.rcp.f32(float)
+
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v6f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef
+  %add = fadd <6 x float> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <6 x float> %add, <6 x float> addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v6i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef
+  %add = add <6 x i32> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v5f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef
+  %add = fadd <5 x float> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <5 x float> %add, <5 x float> addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v5i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef
+  %add = add <5 x i32> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v3f64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr
+  %add = fadd <3 x double> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v3i64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr
+  %add = add <3 x i64> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v4f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr
+  %add = fadd <4 x half> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v4i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr
+  %add = add <4 x i16> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v2f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr
+  %add = fadd <2 x half> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v2i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) {
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
+  %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr
+  %add = add <2 x i16> %load, %phi
+  br i1 %cond, label %loop, label %ret
+
+ret:
+  store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr
+  ret void
+}
+
+; We were expanding undef vectors into zero vectors. Optimizations
+; would then see we used no elements of the vector, and reform the
+; undef vector resulting in a combiner loop.
+; GCN-LABEL: {{^}}inf_loop_undef_vector:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mad_u64_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_add3_u32
+; GCN-NEXT: global_store_dwordx2
+define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
+  %i = insertelement <6 x float> %arg, float %arg1, i64 2
+  %i3 = bitcast <6 x float> %i to <3 x i64>
+  %i4 = extractelement <3 x i64> %i3, i64 0
+  %i5 = extractelement <3 x i64> %i3, i64 1
+  %i6 = mul i64 %i5, %arg2
+  %i7 = add i64 %i6, %i4
+  store volatile i64 %i7, i64 addrspace(1)* undef, align 4
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index 6d611d39eb02..2aa8121eee78 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1406,28 +1406,20 @@ bb7:                                              ; preds = %bb4
 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
 ; SI-LABEL: if_after_kill_block:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    s_wqm_b64 exec, exec
-; SI-NEXT:    s_mov_b32 s0, 0
 ; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %bb3
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_6
 ; SI-NEXT:  ; %bb.2: ; %bb3
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
 ; SI-NEXT:  .LBB13_3: ; %bb4
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_mov_b32 s2, s0
-; SI-NEXT:    s_mov_b32 s3, s0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s0
-; SI-NEXT:    s_mov_b32 s6, s0
-; SI-NEXT:    s_mov_b32 s7, s0
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1448,28 +1440,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ;
 ; GFX10-WAVE64-LABEL: if_after_kill_block:
 ; GFX10-WAVE64:       ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX10-WAVE64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-WAVE64-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT:    s_mov_b32 s0, 0
-; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-WAVE64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX10-WAVE64-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
 ; GFX10-WAVE64-NEXT:  .LBB13_3: ; %bb4
-; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10-WAVE64-NEXT:    s_mov_b32 s1, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s2, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s3, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s4, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s5, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s6, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s7, s0
+; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-WAVE64-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1488,28 +1472,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ;
 ; GFX10-WAVE32-LABEL: if_after_kill_block:
 ; GFX10-WAVE32:       ; %bb.0: ; %bb
-; GFX10-WAVE32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-WAVE32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-WAVE32-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT:    s_mov_b32 s0, 0
-; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10-WAVE32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-WAVE32-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-WAVE32-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
-; GFX10-WAVE32-NEXT:    s_andn2_b32 s1, s1, vcc_lo
+; GFX10-WAVE32-NEXT:    s_andn2_b32 s0, s0, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
 ; GFX10-WAVE32-NEXT:  .LBB13_3: ; %bb4
-; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-WAVE32-NEXT:    s_mov_b32 s1, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s2, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s3, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s4, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s5, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s6, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s7, s0
+; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-WAVE32-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
@@ -1528,29 +1504,22 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ;
 ; GFX11-LABEL: if_after_kill_block:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX11-NEXT:    s_wqm_b64 exec, exec
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b64 s[4:5], exec
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11-NEXT:    v_cmpx_nle_f32_e32 0, v1
-; GFX11-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX11-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX11-NEXT:  ; %bb.1: ; %bb3
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX11-NEXT:    s_and_not1_b64 s[2:3], s[2:3], vcc
+; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[0:1], vcc
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX11-NEXT:  ; %bb.2: ; %bb3
 ; GFX11-NEXT:    s_and_not1_b64 exec, exec, vcc
 ; GFX11-NEXT:  .LBB13_3: ; %bb4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
+; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1593,19 +1562,11 @@ bb9:                                              ; preds = %bb4
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; SI-LABEL: cbranch_kill:
 ; SI:       ; %bb.0: ; %.entry
-; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, v1
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    s_mov_b32 s5, s4
-; SI-NEXT:    s_mov_b32 s6, s4
-; SI-NEXT:    s_mov_b32 s7, s4
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_mov_b32 s9, s4
-; SI-NEXT:    s_mov_b32 s10, s4
-; SI-NEXT:    s_mov_b32 s11, s4
-; SI-NEXT:    image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da
+; SI-NEXT:    image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
@@ -1636,16 +1597,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE64-LABEL: cbranch_kill:
 ; GFX10-WAVE64:       ; %bb.0: ; %.entry
 ; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s4, 0
 ; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
-; GFX10-WAVE64-NEXT:    s_mov_b32 s5, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s6, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s7, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s8, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s9, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s10, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s11, s4
-; GFX10-WAVE64-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE64-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-WAVE64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE64-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
@@ -1676,16 +1629,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE32-LABEL: cbranch_kill:
 ; GFX10-WAVE32:       ; %bb.0: ; %.entry
 ; GFX10-WAVE32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s4, 0
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s0, exec_lo
-; GFX10-WAVE32-NEXT:    s_mov_b32 s5, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s6, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s7, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s8, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s9, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s10, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s11, s4
-; GFX10-WAVE32-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE32-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 0, v1
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
@@ -1716,16 +1661,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX11-LABEL: cbranch_kill:
 ; GFX11:       ; %bb.0: ; %.entry
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
-; GFX11-NEXT:    s_mov_b32 s5, s4
-; GFX11-NEXT:    s_mov_b32 s6, s4
-; GFX11-NEXT:    s_mov_b32 s7, s4
-; GFX11-NEXT:    s_mov_b32 s8, s4
-; GFX11-NEXT:    s_mov_b32 s9, s4
-; GFX11-NEXT:    s_mov_b32 s10, s4
-; GFX11-NEXT:    s_mov_b32 s11, s4
-; GFX11-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX11-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX11-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmpx_ge_f32_e32 0, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
index 20fdbec80b59..fe5f5b5fa195 100644
--- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -30,208 +30,77 @@ bb:
 define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) {
 ; GLOBALNESS1-LABEL: kernel:
 ; GLOBALNESS1:       ; %bb.0: ; %bb
-; GLOBALNESS1-NEXT:    s_load_dwordx4 s[56:59], s[8:9], 0x0
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v43, v0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[54:55], s[6:7]
+; GLOBALNESS1-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v44, 0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v44, off
-; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    global_load_dword v0, v44, s[56:57]
-; GLOBALNESS1-NEXT:    s_mov_b32 s61, 0
-; GLOBALNESS1-NEXT:    s_mov_b32 s60, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s62, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s63, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s64, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s65, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s66, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s67, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s79, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s81, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s85, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s86, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s87, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s88, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s89, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s90, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s91, s61
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, s60
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, s61
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, s62
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a35, s63
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a36, s64
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a37, s65
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a38, s66
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a39, s67
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a40, s68
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a41, s69
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a42, s70
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a43, s71
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a44, s72
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a45, s73
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a46, s74
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a47, s75
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a48, s76
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a49, s77
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a50, s78
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a51, s79
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a52, s80
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a53, s81
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a54, s82
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a55, s83
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a56, s84
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a57, s85
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a58, s86
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a59, s87
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, s88
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, s89
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, s90
-; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, s91
-; GLOBALNESS1-NEXT:    s_movk_i32 s60, 0x80
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s60, 0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s61, 1
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s62, 2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s63, 3
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s64, 4
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s65, 5
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s66, 6
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s67, 7
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s68, 8
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s69, 9
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s70, 10
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s71, 11
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s72, 12
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s73, 13
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s74, 14
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s75, 15
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s76, 16
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s77, 17
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s78, 18
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s79, 19
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s80, 20
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s81, 21
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s82, 22
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s83, 23
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s84, 24
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s85, 25
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s86, 26
-; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS1-NEXT:    global_load_dword v0, v44, s[36:37]
+; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS1-NEXT:    s_mov_b64 s[64:65], s[4:5]
 ; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS1-NEXT:    s_load_dword s7, s[8:9], 0x20
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s87, 27
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s88, 28
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s89, 29
-; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, 0x40994400
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s90, 30
 ; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s91, 31
-; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[44:45]
 ; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s8, 32
 ; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s9, 33
-; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s58, 0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 34
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 35
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s38, 0
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45]
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    s_xor_b64 s[100:101], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_xor_b64 s[94:95], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s6, 0
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS1-NEXT:    s_xor_b64 s[46:47], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_xor_b64 s[88:89], s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s7, 0
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, wobble@gotpcrel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12
-; GLOBALNESS1-NEXT:    s_xor_b64 s[50:51], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_xor_b64 s[86:87], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[66:67], s[6:7], 0x0
+; GLOBALNESS1-NEXT:    s_mov_b32 s98, s16
+; GLOBALNESS1-NEXT:    s_mov_b64 s[62:63], s[8:9]
+; GLOBALNESS1-NEXT:    s_mov_b32 s99, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s100, s14
+; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[92:93], 0x80
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, 0x3ff00000
+; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS1-NEXT:    ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 36
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 37
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 1
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 38
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 39
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 3
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 40
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 41
-; GLOBALNESS1-NEXT:    s_mov_b32 s57, 0x3ff00000
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s56, 42
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s57, 43
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s58, 44
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s59, 45
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s60, 46
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s61, 47
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s62, 48
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s63, 49
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s64, 50
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s65, 51
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s66, 52
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s67, 53
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s68, 54
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s78, 0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s69, 55
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s79, 1
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s70, 56
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s80, 2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s71, 57
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s81, 3
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s72, 58
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s82, 4
-; GLOBALNESS1-NEXT:    s_load_dwordx2 s[52:53], s[6:7], 0x0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s73, 59
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s83, 5
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s74, 60
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s84, 6
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s75, 61
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s85, 7
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s76, 62
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s86, 8
-; GLOBALNESS1-NEXT:    s_mov_b32 s44, s16
-; GLOBALNESS1-NEXT:    s_mov_b32 s45, s15
-; GLOBALNESS1-NEXT:    s_mov_b32 s42, s14
-; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v1
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[48:49], 1, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s77, 63
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s87, 9
-; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 4
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[90:91], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 5
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 40
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 41
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 5
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_28
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_29
 ; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow6
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow19
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, v31
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[8:9]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, v30
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, v29
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, v28
@@ -263,150 +132,72 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, v2
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, v1
 ; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, v0
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_29
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS1-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
-; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s62, 40
 ; GLOBALNESS1-NEXT:    buffer_store_dword v44, off, s[0:3], 0
-; GLOBALNESS1-NEXT:    flat_load_dword v46, v[0:1]
-; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    flat_load_dword v43, v[0:1]
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s63, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
-; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
-; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
-; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
-; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
-; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
-; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
-; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
-; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
-; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
-; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
-; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
-; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
-; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
-; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
-; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
-; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
-; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
-; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
-; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
-; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
-; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[52:53]
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[66:67]
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[42:43]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_8
 ; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s59, 1
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s39, 1
 ; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock3
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s59, 1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s39, 1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_8
-; GLOBALNESS1-NEXT:    s_branch .LBB1_9
+; GLOBALNESS1-NEXT:    s_cbranch_execnz .LBB1_8
+; GLOBALNESS1-NEXT:    s_branch .LBB1_23
 ; GLOBALNESS1-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS1-NEXT:    s_branch .LBB1_23
+; GLOBALNESS1-NEXT:  .LBB1_8: ; %Flow16
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s59, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow16
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 1
-; GLOBALNESS1-NEXT:    s_mov_b64 s[56:57], s[68:69]
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 3
-; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 5
-; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 9
-; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 10
-; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 12
-; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 13
-; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 14
-; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 15
-; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 16
-; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 17
-; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 18
-; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 19
-; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 20
-; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 21
-; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 22
-; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 23
-; GLOBALNESS1-NEXT:    v_readlane_b32 s92, v41, 24
-; GLOBALNESS1-NEXT:    v_readlane_b32 s93, v41, 25
-; GLOBALNESS1-NEXT:    v_readlane_b32 s94, v41, 26
-; GLOBALNESS1-NEXT:    v_readlane_b32 s95, v41, 27
-; GLOBALNESS1-NEXT:    v_readlane_b32 s96, v41, 28
-; GLOBALNESS1-NEXT:    v_readlane_b32 s97, v41, 29
-; GLOBALNESS1-NEXT:    v_readlane_b32 s98, v41, 30
-; GLOBALNESS1-NEXT:    v_readlane_b32 s99, v41, 31
-; GLOBALNESS1-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s69, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s71, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s73, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s75, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s77, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s78, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s79, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s80, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s81, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s85, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s86, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s87, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s88, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s89, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s90, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s91, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s92, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s93, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s94, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s95, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s96, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s97, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s98, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s99, s57
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s69
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s69
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[96:97], 0, v0
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], -1
-; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
@@ -422,570 +213,280 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
-; GLOBALNESS1-NEXT:  ; %bb.10: ; %baz.exit.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v0, v[0:1]
-; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
-; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
-; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
-; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
-; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
-; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
-; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
-; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
-; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
-; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
-; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
-; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
-; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
-; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
-; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
-; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
-; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
-; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
-; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
-; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
-; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
-; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
-; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
-; GLOBALNESS1-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 42
-; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 43
-; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 44
-; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 45
-; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 46
-; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 47
-; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 48
-; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 49
-; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 50
-; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 51
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 52
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 53
-; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 54
-; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 55
-; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 56
-; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 57
-; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 58
-; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 59
-; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 60
-; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 61
-; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 62
-; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 63
-; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v42, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v42, 1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v42, 2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v42, 3
-; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v42, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v42, 5
-; GLOBALNESS1-NEXT:    s_mov_b32 s60, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s62, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s63, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s64, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s65, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s66, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s67, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
-; GLOBALNESS1-NEXT:    s_mov_b32 s57, s61
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s56, 42
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s57, 43
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s58, 44
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s59, 45
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s60, 46
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s61, 47
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s62, 48
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s63, 49
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s64, 50
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s65, 51
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s66, 52
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s67, 53
-; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v42, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v42, 7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v42, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v42, 9
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s68, 54
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s78, 0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s69, 55
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s79, 1
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s70, 56
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s80, 2
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s71, 57
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s81, 3
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s72, 58
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s82, 4
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s73, 59
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s83, 5
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s74, 60
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s84, 6
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s75, 61
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s85, 7
-; GLOBALNESS1-NEXT:    s_mov_b64 s[92:93], s[54:55]
-; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[54:55], 0, v0
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s76, 62
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s86, 8
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s77, 63
-; GLOBALNESS1-NEXT:    v_writelane_b32 v42, s87, 9
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[70:71], s[70:71] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[72:73], s[72:73] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[74:75], s[74:75] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[76:77], s[76:77] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[78:79], s[78:79] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[80:81], s[80:81] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[82:83], s[82:83] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[84:85], s[84:85] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[88:89], s[54:55]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_25
-; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb33.i
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[70:71], s[96:97]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS1-NEXT:  ; %bb.10: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 36
-; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 37
-; GLOBALNESS1-NEXT:    s_mov_b32 s91, s59
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 1
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_13
-; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb39.i
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_12
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
-; GLOBALNESS1-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
 ; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
-; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 32
-; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 34
+; GLOBALNESS1-NEXT:    s_mov_b64 s[72:73], s[42:43]
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s39
 ; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
 ; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
-; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 33
-; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 35
-; GLOBALNESS1-NEXT:    s_branch .LBB1_16
-; GLOBALNESS1-NEXT:  .LBB1_14: ; %Flow7
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS1-NEXT:    s_branch .LBB1_15
+; GLOBALNESS1-NEXT:  .LBB1_13: ; %Flow7
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb63.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[50:51]
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb44.i
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[86:87]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %bb44.i
 ; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[100:101]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb46.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[46:47]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb50.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb3.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[64:65]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb6.i.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[94:95]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.16: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[88:89]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.17: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[40:41]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_20
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
-; GLOBALNESS1-NEXT:  .LBB1_21: ; %spam.exit.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[48:49]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb55.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS1-NEXT:    s_add_u32 s60, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s61, s39, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:  .LBB1_20: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[90:91]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS1-NEXT:    s_add_u32 s60, s62, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s61, s63, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[66:67]
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
-; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[52:53]
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[66:67]
 ; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
-; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_14
-; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb62.i
-; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_13
+; GLOBALNESS1-NEXT:  ; %bb.22: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS1-NEXT:    s_branch .LBB1_14
-; GLOBALNESS1-NEXT:  .LBB1_24: ; %Flow14
+; GLOBALNESS1-NEXT:    s_branch .LBB1_13
+; GLOBALNESS1-NEXT:  .LBB1_23: ; %LeafBlock
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s56, v41, 0
-; GLOBALNESS1-NEXT:    v_readlane_b32 s57, v41, 1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 12
-; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 13
-; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 14
-; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 15
-; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 16
-; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 17
-; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 18
-; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 19
-; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 20
-; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 21
-; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 22
-; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 23
-; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 24
-; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 25
-; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 26
-; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 27
-; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 28
-; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 29
-; GLOBALNESS1-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s69, s57
-; GLOBALNESS1-NEXT:    v_readlane_b32 s59, v41, 3
-; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 30
-; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 31
-; GLOBALNESS1-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s71, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s73, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s75, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s77, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s78, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s79, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s80, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s81, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s82, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s83, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s84, s57
-; GLOBALNESS1-NEXT:    s_mov_b32 s85, s57
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_readlane_b32 s58, v41, 2
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
-; GLOBALNESS1-NEXT:    s_mov_b32 s59, s91
-; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 4
-; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 5
-; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 6
-; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 7
-; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 8
-; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 9
-; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 10
-; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 11
-; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow15
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GLOBALNESS1-NEXT:    s_branch .LBB1_3
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow14
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b32 s36, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s37, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s38, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s39, s93
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b32 s40, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s41, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s42, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s43, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s44, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s45, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s46, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s47, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s48, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s49, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s50, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s51, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s52, s93
+; GLOBALNESS1-NEXT:    s_mov_b32 s53, s93
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[50:51], s[50:51] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[6:7]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b32 s39, s75
+; GLOBALNESS1-NEXT:    s_mov_b64 s[42:43], s[72:73]
+; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow15
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[88:89]
-; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[54:55]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[54:55], s[92:93]
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[70:71]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS1-NEXT:  ; %bb.26: ; %bb67.i
+; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 38
-; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 39
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 3
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb69.i
+; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
-; GLOBALNESS1-NEXT:  .LBB1_28: ; %bb73.i
+; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
 ; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
-; GLOBALNESS1-NEXT:  .LBB1_29: ; %loop.exit.guard
+; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
-; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_31
-; GLOBALNESS1-NEXT:  ; %bb.30: ; %bb7.i.i
-; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS1-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s62, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s63, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT:  .LBB1_31: ; %Flow
+; GLOBALNESS1-NEXT:  .LBB1_32: ; %Flow
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_33
-; GLOBALNESS1-NEXT:  ; %bb.32: ; %bb11.i.i
-; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS1-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s62, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s63, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS1-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS1-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS1-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
 ; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS1-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
+; GLOBALNESS1-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
 ;
 ; GLOBALNESS0-LABEL: kernel:
 ; GLOBALNESS0:       ; %bb.0: ; %bb
-; GLOBALNESS0-NEXT:    s_load_dwordx4 s[56:59], s[8:9], 0x0
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[6:7]
+; GLOBALNESS0-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v44, off
-; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[6:7]
-; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[56:57]
-; GLOBALNESS0-NEXT:    s_mov_b32 s61, 0
-; GLOBALNESS0-NEXT:    s_mov_b32 s60, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s62, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s65, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s66, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s67, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s79, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s81, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s82, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s83, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s84, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s85, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s86, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s87, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s88, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s89, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, s60
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, s61
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, s62
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, s63
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a36, s64
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a37, s65
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a38, s66
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a39, s67
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a40, s68
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a41, s69
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a42, s70
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a43, s71
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a44, s72
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a45, s73
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a46, s74
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a47, s75
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a48, s76
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a49, s77
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a50, s78
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a51, s79
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a52, s80
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a53, s81
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a54, s82
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a55, s83
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a56, s84
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a57, s85
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a58, s86
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a59, s87
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, s88
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, s89
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, s90
-; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, s91
-; GLOBALNESS0-NEXT:    s_movk_i32 s60, 0x80
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 3
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 4
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 5
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 7
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 8
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 10
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 11
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 12
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 13
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 14
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 15
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 16
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 18
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 19
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s80, 20
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s81, 21
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s82, 22
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s83, 23
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s84, 24
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s85, 25
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s86, 26
-; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[36:37]
+; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[4:5]
 ; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x18
 ; GLOBALNESS0-NEXT:    s_load_dword s7, s[8:9], 0x20
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s87, 27
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s88, 28
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s89, 29
-; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0x40994400
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s90, 30
 ; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s91, 31
-; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[8:9], s[4:5], v[44:45]
 ; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s8, 32
 ; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s9, 33
-; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0
-; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s58, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 34
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 35
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s38, 0
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[44:45]
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[46:47], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[94:95], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s6, 0
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GLOBALNESS0-NEXT:    s_xor_b64 s[50:51], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[88:89], s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s7, 0
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
 ; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, wobble@gotpcrel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12
-; GLOBALNESS0-NEXT:    s_xor_b64 s[52:53], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_xor_b64 s[86:87], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[66:67], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    s_mov_b32 s98, s16
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[8:9]
+; GLOBALNESS0-NEXT:    s_mov_b32 s99, s15
+; GLOBALNESS0-NEXT:    s_mov_b32 s100, s14
+; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[92:93], 0x80
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[42:43], 1, v1
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, 0x3ff00000
+; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS0-NEXT:    ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 36
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 37
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 1
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 38
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 39
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 3
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 40
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 41
-; GLOBALNESS0-NEXT:    s_mov_b32 s57, 0x3ff00000
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 42
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 43
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 44
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 45
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 46
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 47
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 48
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 49
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 50
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 51
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 52
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 53
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 54
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s78, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 55
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s79, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 56
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s80, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 57
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s81, 3
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 58
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s82, 4
-; GLOBALNESS0-NEXT:    s_load_dwordx2 s[100:101], s[6:7], 0x0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 59
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s83, 5
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 60
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s84, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s85, 7
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 62
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s86, 8
-; GLOBALNESS0-NEXT:    s_mov_b32 s44, s16
-; GLOBALNESS0-NEXT:    s_mov_b32 s45, s15
-; GLOBALNESS0-NEXT:    s_mov_b32 s42, s14
-; GLOBALNESS0-NEXT:    s_mov_b64 s[34:35], s[10:11]
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[54:55], 1, v1
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[48:49], 1, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 63
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s87, 9
-; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 4
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[90:91], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 5
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_4
 ; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 40
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 41
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_28
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
 ; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow6
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow19
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, v31
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[8:9]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, v30
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, v29
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, v28
@@ -1017,148 +518,72 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, v2
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, v1
 ; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, v0
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_29
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
 ; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
 ; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
-; GLOBALNESS0-NEXT:    ; Child Loop BB1_16 Depth 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 1
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_15 Depth 2
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s60, 40
 ; GLOBALNESS0-NEXT:    buffer_store_dword v44, off, s[0:3], 0
-; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    flat_load_dword v43, v[0:1]
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[54:55]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[66:67]
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[42:43]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_8
 ; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s59, 1
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s39, 1
 ; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
 ; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock3
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s59, 1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s39, 1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
 ; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_8
-; GLOBALNESS0-NEXT:    s_branch .LBB1_9
+; GLOBALNESS0-NEXT:    s_cbranch_execnz .LBB1_8
+; GLOBALNESS0-NEXT:    s_branch .LBB1_23
 ; GLOBALNESS0-NEXT:  .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
 ; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT:  .LBB1_8: ; %LeafBlock
+; GLOBALNESS0-NEXT:    s_branch .LBB1_23
+; GLOBALNESS0-NEXT:  .LBB1_8: ; %Flow16
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s59, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow16
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 5
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[64:65]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 31
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s69, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s71, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s73, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s75, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s77, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s78, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s79, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s80, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s81, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s82, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s83, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s84, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s85, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s86, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s87, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s88, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s89, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s90, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s91, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s92, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s93, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s94, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s95, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s96, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s97, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s98, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s99, s57
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s69
+; GLOBALNESS0-NEXT:    s_mov_b32 s84, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s85, s69
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[96:97], 0, v0
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], -1
-; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
@@ -1174,367 +599,206 @@ define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i,
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 3
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
-; GLOBALNESS0-NEXT:  ; %bb.10: ; %baz.exit.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v0, v[0:1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 25
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 31
-; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 42
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 43
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 44
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 45
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 46
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 47
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 48
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 49
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 50
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 51
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 52
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 53
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 54
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 55
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 56
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 57
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 58
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 59
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 60
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 61
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 62
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 63
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v42, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v42, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v42, 2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v42, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v42, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v42, 5
-; GLOBALNESS0-NEXT:    s_mov_b32 s60, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s62, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s65, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s66, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s67, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
-; GLOBALNESS0-NEXT:    s_mov_b32 s57, s61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 42
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 43
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 44
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 45
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 46
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 47
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 48
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 49
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 50
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 51
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 52
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 53
-; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v42, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v42, 7
-; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v42, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v42, 9
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 54
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s78, 0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 55
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s79, 1
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 56
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s80, 2
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 57
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s81, 3
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 58
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s82, 4
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 59
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s83, 5
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 60
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s84, 6
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 61
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s85, 7
-; GLOBALNESS0-NEXT:    s_mov_b64 s[92:93], s[54:55]
-; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[54:55], 0, v0
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 62
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s86, 8
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 63
-; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s87, 9
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[62:63], s[62:63] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[64:65], s[64:65] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[66:67], s[66:67] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[70:71], s[70:71] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[72:73], s[72:73] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[74:75], s[74:75] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[76:77], s[76:77] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[78:79], s[78:79] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[80:81], s[80:81] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[82:83], s[82:83] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[84:85], s[84:85] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[86:87], s[86:87] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[88:89], s[88:89] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[90:91], s[90:91] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[88:89], s[54:55]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_25
-; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb33.i
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[70:71], s[96:97]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS0-NEXT:  ; %bb.10: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
 ; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
-; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 36
-; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 37
-; GLOBALNESS0-NEXT:    s_mov_b32 s91, s59
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 1
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_13
-; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb39.i
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_12
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
-; GLOBALNESS0-NEXT:  .LBB1_13: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
 ; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 34
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 32
+; GLOBALNESS0-NEXT:    s_mov_b64 s[72:73], s[42:43]
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s39
 ; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
 ; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
 ; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 35
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 33
-; GLOBALNESS0-NEXT:    s_branch .LBB1_16
-; GLOBALNESS0-NEXT:  .LBB1_14: ; %Flow7
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS0-NEXT:    s_branch .LBB1_15
+; GLOBALNESS0-NEXT:  .LBB1_13: ; %Flow7
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb63.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[52:53]
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
-; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb44.i
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[86:87]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %bb44.i
 ; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[46:47]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb46.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[50:51]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb50.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb3.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[60:61]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_21
-; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb6.i.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[94:95]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.16: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[88:89]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.17: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[40:41]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_20
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
-; GLOBALNESS0-NEXT:  .LBB1_21: ; %spam.exit.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[48:49]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_15
-; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb55.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
-; GLOBALNESS0-NEXT:    s_add_u32 s64, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s65, s39, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:  .LBB1_20: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[90:91]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
+; GLOBALNESS0-NEXT:    s_add_u32 s64, s60, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s65, s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[66:67]
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
-; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[100:101]
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[66:67]
 ; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
-; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_14
-; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb62.i
-; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_16 Depth=2
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_13
+; GLOBALNESS0-NEXT:  ; %bb.22: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_15 Depth=2
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
-; GLOBALNESS0-NEXT:    s_branch .LBB1_14
-; GLOBALNESS0-NEXT:  .LBB1_24: ; %Flow14
+; GLOBALNESS0-NEXT:    s_branch .LBB1_13
+; GLOBALNESS0-NEXT:  .LBB1_23: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s39, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:  .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GLOBALNESS0-NEXT:    s_branch .LBB1_3
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow14
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v41, 0
-; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v41, 1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
-; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
-; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
-; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
-; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
-; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
-; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
-; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
-; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
-; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
-; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
-; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
-; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
-; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
-; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
-; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
-; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
-; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
-; GLOBALNESS0-NEXT:    s_mov_b32 s64, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s65, s57
-; GLOBALNESS0-NEXT:    v_readlane_b32 s59, v41, 3
-; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
-; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
-; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
-; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
-; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
-; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
-; GLOBALNESS0-NEXT:    s_mov_b32 s66, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s67, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s68, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s69, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s70, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s71, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s72, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s73, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s74, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s75, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s76, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s77, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s78, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s79, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s80, s57
-; GLOBALNESS0-NEXT:    s_mov_b32 s81, s57
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_readlane_b32 s58, v41, 2
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
-; GLOBALNESS0-NEXT:    s_mov_b32 s59, s91
-; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 4
-; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 5
-; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 6
-; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 7
-; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow15
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b32 s36, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s37, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s38, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s39, s93
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[40:41]
+; GLOBALNESS0-NEXT:    s_mov_b32 s40, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s41, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s42, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s43, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s44, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s46, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s47, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s48, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s49, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s50, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s51, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s52, s93
+; GLOBALNESS0-NEXT:    s_mov_b32 s53, s93
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[40:41], s[40:41] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[42:43], s[42:43] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[44:45], s[44:45] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[46:47], s[46:47] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[48:49], s[48:49] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[50:51], s[50:51] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[52:53], s[52:53] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[54:55], s[54:55] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[56:57], s[56:57] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[58:59], s[58:59] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[40:41], s[6:7]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b32 s39, s75
+; GLOBALNESS0-NEXT:    s_mov_b64 s[42:43], s[72:73]
+; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow15
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[88:89]
-; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[54:55]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[92:93]
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[70:71]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
-; GLOBALNESS0-NEXT:  ; %bb.26: ; %bb67.i
+; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 38
-; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 3
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
 ; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
-; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb69.i
+; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
-; GLOBALNESS0-NEXT:  .LBB1_28: ; %bb73.i
+; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
 ; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
 ; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
-; GLOBALNESS0-NEXT:  .LBB1_29: ; %loop.exit.guard
+; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
-; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_31
-; GLOBALNESS0-NEXT:  ; %bb.30: ; %bb7.i.i
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS0-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s60, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT:  .LBB1_31: ; %Flow
+; GLOBALNESS0-NEXT:  .LBB1_32: ; %Flow
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_33
-; GLOBALNESS0-NEXT:  ; %bb.32: ; %bb11.i.i
-; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
-; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
-; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS0-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s60, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[34:35]
-; GLOBALNESS0-NEXT:    s_mov_b32 s12, s42
-; GLOBALNESS0-NEXT:    s_mov_b32 s13, s45
-; GLOBALNESS0-NEXT:    s_mov_b32 s14, s44
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s100
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s99
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
 ; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
 ; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; GLOBALNESS0-NEXT:  .LBB1_33: ; %UnifiedUnreachableBlock
+; GLOBALNESS0-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
 bb:
   store i32 0, i32 addrspace(1)* null, align 4
   %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll
index a5e0454a3634..1326ba437f94 100644
--- a/llvm/test/CodeGen/AMDGPU/v1024.ll
+++ b/llvm/test/CodeGen/AMDGPU/v1024.ll
@@ -10,6 +10,7 @@ define amdgpu_kernel void @test_v1024() {
 entry:
   %alloca = alloca <32 x i32>, align 16, addrspace(5)
   %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false)
   br i1 undef, label %if.then.i.i, label %if.else.i
 
 if.then.i.i:                                      ; preds = %entry
@@ -24,6 +25,7 @@ if.then.i62.i:                                    ; preds = %if.else.i, %if.then
   ret void
 }
 
+declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture readonly, i8, i32, i1 immarg)
 declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
 
 declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 5164b072a6dd..ed0de729dafd 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,7 +14,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v16
@@ -22,13 +21,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v12
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_mov_b32 s6, s4
-; GFX9-NEXT:    s_mov_b32 s7, s4
-; GFX9-NEXT:    s_mov_b32 s8, s4
-; GFX9-NEXT:    s_mov_b32 s9, s4
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s4
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -82,16 +74,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX10-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v32, v12
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_mov_b32 s6, s4
-; GFX10-NEXT:    s_mov_b32 s7, s4
-; GFX10-NEXT:    s_mov_b32 s8, s4
-; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -145,16 +129,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
 ; GFX11-NEXT:    v_mov_b32_e32 v32, v12
-; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
@@ -225,65 +201,41 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX9-NEXT:    s_mov_b32 s36, 0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v45, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v44, v15
 ; GFX9-NEXT:    v_mov_b32_e32 v43, v14
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v12
-; GFX9-NEXT:    s_mov_b32 s37, s36
-; GFX9-NEXT:    s_mov_b32 s38, s36
-; GFX9-NEXT:    s_mov_b32 s39, s36
-; GFX9-NEXT:    s_mov_b32 s40, s36
-; GFX9-NEXT:    s_mov_b32 s41, s36
-; GFX9-NEXT:    s_mov_b32 s42, s36
-; GFX9-NEXT:    s_mov_b32 s43, s36
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -298,66 +250,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v15
 ; GFX10-NEXT:    v_mov_b32_e32 v43, v14
-; GFX10-NEXT:    v_mov_b32_e32 v44, v13
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_mov_b32_e32 v44, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v45, v12
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX10-NEXT:    s_mov_b32 s36, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX10-NEXT:    s_mov_b32 s37, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX10-NEXT:    s_mov_b32 s38, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX10-NEXT:    s_mov_b32 s39, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX10-NEXT:    s_mov_b32 s40, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
-; GFX10-NEXT:    s_mov_b32 s41, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX10-NEXT:    s_mov_b32 s42, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
-; GFX10-NEXT:    s_mov_b32 s43, s36
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_clause 0x4
 ; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -372,7 +300,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
@@ -380,56 +308,32 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v45, s33
+; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v45, v12
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX11-NEXT:    s_mov_b32 s36, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX11-NEXT:    s_mov_b32 s37, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX11-NEXT:    s_mov_b32 s38, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX11-NEXT:    s_mov_b32 s39, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX11-NEXT:    s_mov_b32 s40, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s41, 7
-; GFX11-NEXT:    s_mov_b32 s41, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX11-NEXT:    s_mov_b32 s42, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s43, 9
-; GFX11-NEXT:    s_mov_b32 s43, s36
-; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_load_b32 v45, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
-; GFX11-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
index c8312c0f039d..7cd9fb9bcd4f 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -6,7 +6,7 @@ declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>,
 
 define amdgpu_cs void @xyz () {
 ; CHECK-LABEL: xyz:
-; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[4:11], v[4:11], v[0:3]
+; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[0:7], v[0:7], v[0:3]
 
 .entry:
   br label %loop

diff  --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index dc85462631d4..16c30174657a 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1833,87 +1833,54 @@ main_body:
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX9-W64-LABEL: test_loop_vcc:
 ; GFX9-W64:       ; %bb.0: ; %entry
-; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-W64-NEXT:    s_mov_b32 s10, 0x40e00000
+; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
 ; GFX9-W64-NEXT:    s_branch .LBB31_2
 ; GFX9-W64-NEXT:  .LBB31_1: ; %body
 ; GFX9-W64-NEXT:    ; in Loop: Header=BB31_2 Depth=1
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
 ; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_4
 ; GFX9-W64-NEXT:  .LBB31_2: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v8
+; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v8
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-W64-NEXT:    s_cbranch_vccz .LBB31_1
 ; GFX9-W64-NEXT:  ; %bb.3:
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
 ; GFX9-W64-NEXT:  .LBB31_4: ; %break
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-W32-LABEL: test_loop_vcc:
 ; GFX10-W32:       ; %bb.0: ; %entry
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_branch .LBB31_2
 ; GFX10-W32-NEXT:    .p2align 6
 ; GFX10-W32-NEXT:  .LBB31_1: ; %body
 ; GFX10-W32-NEXT:    ; in Loop: Header=BB31_2 Depth=1
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
-; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_mov_b32 s1, 0
+; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_4
 ; GFX10-W32-NEXT:  .LBB31_2: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1925,11 +1892,10 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-W32-NEXT:    s_cbranch_vccz .LBB31_1
 ; GFX10-W32-NEXT:  ; %bb.3:
-; GFX10-W32-NEXT:    s_mov_b32 s1, -1
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
 ; GFX10-W32-NEXT:  .LBB31_4: ; %break
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
@@ -1999,14 +1965,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
 ; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v2, 2, v0
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
@@ -2035,14 +1993,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v2, s[8:11], 0 offen
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
@@ -2079,18 +2029,10 @@ entry:
 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 ; GFX9-W64-LABEL: test_nonvoid_return:
 ; GFX9-W64:       ; %bb.0:
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
@@ -2098,18 +2040,10 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 ;
 ; GFX10-W32-LABEL: test_nonvoid_return:
 ; GFX10-W32:       ; %bb.0:
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
@@ -2128,20 +2062,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
 ; GFX9-W64:       ; %bb.0: ; %entry
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
-; GFX9-W64-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, exec
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB34_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
@@ -2155,20 +2080,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) noun
 ;
 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
 ; GFX10-W32:       ; %bb.0: ; %entry
-; GFX10-W32-NEXT:    s_mov_b32 s4, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s5, s4
-; GFX10-W32-NEXT:    s_mov_b32 s6, s4
-; GFX10-W32-NEXT:    s_mov_b32 s7, s4
-; GFX10-W32-NEXT:    s_mov_b32 s8, s4
-; GFX10-W32-NEXT:    s_mov_b32 s9, s4
-; GFX10-W32-NEXT:    s_mov_b32 s10, s4
-; GFX10-W32-NEXT:    s_mov_b32 s11, s4
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB34_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
@@ -2215,33 +2131,17 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
-; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_3
 ; GFX9-W64-NEXT:    s_branch .LBB35_4
 ; GFX9-W64-NEXT:  .LBB35_2:
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9-W64-NEXT:  .LBB35_3: ; %if
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:  .LBB35_4: ; %end
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
@@ -2252,21 +2152,13 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX10-W32-LABEL: test_scc:
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_3
 ; GFX10-W32-NEXT:    s_branch .LBB35_4
@@ -2275,17 +2167,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 ; GFX10-W32-NEXT:  .LBB35_3: ; %if
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:  .LBB35_4: ; %end
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)


        


More information about the llvm-commits mailing list