[llvm] ce096b2 - AMDGPU: Convert some tests to opaque pointers

Matt Arsenault via llvm-commits <llvm-commits at lists.llvm.org>
Mon Dec 19 06:04:23 PST 2022


Author: Matt Arsenault
Date: 2022-12-19T09:04:17-05:00
New Revision: ce096b2207d9a04c8170f3b67071cb8cb299f5b6

URL: https://github.com/llvm/llvm-project/commit/ce096b2207d9a04c8170f3b67071cb8cb299f5b6
DIFF: https://github.com/llvm/llvm-project/commit/ce096b2207d9a04c8170f3b67071cb8cb299f5b6.diff

LOG: AMDGPU: Convert some tests to opaque pointers

These required rerunning update_mir_test_checks.
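
For context on the mechanical change: in each of these tests, typed pointer types such as i1 addrspace(1)* are rewritten to the opaque form ptr addrspace(1), and the autogenerated GCN/CHECK lines are then refreshed by rerunning llvm/utils/update_mir_test_checks.py on the test file. A minimal sketch of the IR-level pattern follows (illustrative only; @example is a made-up function, not one taken from the commit):

; Before (typed pointers): the pointee type is spelled inside the pointer type.
;   define amdgpu_kernel void @example(i1 addrspace(1)* %out, i1 %z) {
;     store i1 %z, i1 addrspace(1)* %out
;     ret void
;   }
;
; After (opaque pointers): only the address space remains on the pointer type.
define amdgpu_kernel void @example(ptr addrspace(1) %out, i1 %z) {
  store i1 %z, ptr addrspace(1) %out
  ret void
}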

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
    llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
    llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
    llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
    llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index b7d76f1ae63c..622847e9165c 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
 
-define amdgpu_kernel void @uniform_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
   ; GCN-LABEL: name: uniform_trunc_i16_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $sgpr0_sgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
-  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down.cast, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
@@ -27,11 +27,11 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
-  store i1 %select, i1 addrspace(1)* %out
+  store i1 %select, ptr addrspace(1) %out
   ret void
 }
 
-define i1 @divergent_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
+define i1 @divergent_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i16_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
@@ -52,14 +52,14 @@ define i1 @divergent_trunc_i16_to_i1(i1 addrspace(1)* %out, i16 %x, i1 %z) {
   ret i1 %select
 }
 
-define amdgpu_kernel void @uniform_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+define amdgpu_kernel void @uniform_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
   ; GCN-LABEL: name: uniform_trunc_i32_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $sgpr0_sgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
-  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.1, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s64) from %ir.x.kernarg.offset, align 4, addrspace 4)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
@@ -79,11 +79,11 @@ define amdgpu_kernel void @uniform_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i32 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
-  store i1 %select, i1 addrspace(1)* %out
+  store i1 %select, ptr addrspace(1) %out
   ret void
 }
 
-define i1 @divergent_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
+define i1 @divergent_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i32_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $vgpr2, $vgpr3
@@ -103,14 +103,14 @@ define i1 @divergent_trunc_i32_to_i1(i1 addrspace(1)* %out, i32 %x, i1 %z) {
   ret i1 %select
 }
 
-define amdgpu_kernel void @uniform_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
   ; GCN-LABEL: name: uniform_trunc_i64_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $sgpr0_sgpr1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
-  ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.1, align 4, addrspace 4)
-  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down.cast, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1
@@ -130,15 +130,15 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x
   ; GCN-NEXT:   [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.3, addrspace 1)
+  ; GCN-NEXT:   BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   %setcc = icmp slt i64 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
-  store i1 %select, i1 addrspace(1)* %out
+  store i1 %select, ptr addrspace(1) %out
   ret void
 }
 
-define i1 @divergent_trunc_i64_to_i1(i1 addrspace(1)* %out, i64 %x, i1 %z) {
+define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
   ; GCN-LABEL: name: divergent_trunc_i64_to_i1
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $vgpr2, $vgpr3, $vgpr4

diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 892afb1bf92c..52eab573ea44 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -96,7 +96,7 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b32 s33, s4
 ; NO-SPILL-TO-VGPR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i32, addrspace(5)
-  store volatile i32 0, i32 addrspace(5)* %alloca
+  store volatile i32 0, ptr addrspace(5) %alloca
   call void @external_void_func_void()
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 52553fcac67f..e0e8dc88bdac 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -7,7 +7,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN: bb.0 (%ir-block.0):
   ; GCN-NEXT:   liveins: $sgpr4_sgpr5
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset.cast, addrspace 4)
+  ; GCN-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
@@ -16,7 +16,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   undef %24.sub0:av_64 = COPY %22.sub0
   ; GCN-NEXT:   SI_SPILL_AV64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `<4 x i32> addrspace(1)* undef`, addrspace 1)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %16:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   undef %23.sub0:vreg_64 = COPY [[SI_SPILL_AV64_RESTORE]].sub0
   ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3080201 /* reguse:VReg_64 */, %23
@@ -24,7 +24,7 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   %v0 = call i32 asm sideeffect "; def $0", "=v"()
   %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
   %mai = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %arg, i32 0, i32 0, i32 0)
-  store volatile <4 x i32> %mai, <4 x i32> addrspace(1)* undef
+  store volatile <4 x i32> %mai, ptr addrspace(1) undef
   call void asm sideeffect "; use $0", "v"(<2 x i32> %tmp);
   ret void
 }

diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 323425bf1f84..c70b0190e0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -30,7 +30,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr9
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8
-  ; CHECK-NEXT:   undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (invariant load (s64) from %ir.40, addrspace 4)
+  ; CHECK-NEXT:   undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
   ; CHECK-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc
@@ -40,8 +40,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.84, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `<4 x i32> addrspace(4)* undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   KILL undef %74:sreg_64
   ; CHECK-NEXT:   KILL undef %132:sgpr_128
@@ -60,7 +60,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   undef %149.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   %149.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef %156.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.91, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
   ; CHECK-NEXT:   %156.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
@@ -104,11 +104,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %369:sgpr_128, undef %370:sreg_32, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %380:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.103, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.111, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.123, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0 :: (invariant load (s128) from %ir.92, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %163, 0, 0 :: (invariant load (s128) from %ir.97, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %176, 0, 0 :: (invariant load (s128) from %ir.104, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %183, 0, 0 :: (invariant load (s128) from %ir.109, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0 :: (invariant load (s128) from %ir.114, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0 :: (dereferenceable invariant load (s32))
@@ -121,7 +121,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   undef %335.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   %335.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef %343.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.131, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0 :: (invariant load (s128) from %ir.121, addrspace 4)
   ; CHECK-NEXT:   %343.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef %351.sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
@@ -130,11 +130,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.155, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.138, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0 :: (invariant load (s128) from %ir.126, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0 :: (invariant load (s128) from %ir.127, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.144, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0 :: (invariant load (s128) from %ir.132, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0 :: (invariant load (s128) from %ir.137, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
@@ -157,29 +157,29 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc
   ; CHECK-NEXT:   [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0 :: (invariant load (s128) from %ir.147, addrspace 4)
   ; CHECK-NEXT:   [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc
   ; CHECK-NEXT:   %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir..i085.i, align 8, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %441, 0, 0 :: (invariant load (s32) from %ir.269, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0 :: (invariant load (s128) from %ir.154, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.176, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   %71.sub3:sgpr_128 = S_MOV_B32 553734060
   ; CHECK-NEXT:   %71.sub2:sgpr_128 = S_MOV_B32 -1
   ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.185, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0 :: (invariant load (s128) from %ir.167, addrspace 4)
   ; CHECK-NEXT:   [[COPY13]].sub1:sgpr_128 = COPY %302.sub1
   ; CHECK-NEXT:   [[COPY13]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY13]], 0, 0 :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.194, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.200, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0 :: (invariant load (s128) from %ir.175, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0 :: (invariant load (s128) from %ir.180, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, implicit-def dead $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
@@ -187,18 +187,18 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
   ; CHECK-NEXT:   undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc
   ; CHECK-NEXT:   %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.308, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %453, 0, 0 :: (invariant load (s64) from %ir.277, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.223, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.230, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0 :: (invariant load (s128) from %ir.202, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0 :: (invariant load (s128) from %ir.208, addrspace 4)
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.236, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
   ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
   ; CHECK-NEXT:   [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.242, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc
@@ -208,24 +208,24 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc
   ; CHECK-NEXT:   %468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.320, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (invariant load (s64) from %ir.287, addrspace 4)
   ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71
   ; CHECK-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
   ; CHECK-NEXT:   [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
   ; CHECK-NEXT:   [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
   ; CHECK-NEXT:   [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.282, addrspace 4)
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `i32 addrspace(4)* undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0 :: (invariant load (s128) from %ir.253, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   KILL %411.sub0, %411.sub1
   ; CHECK-NEXT:   KILL undef %488:sreg_64
   ; CHECK-NEXT:   KILL [[COPY15]].sub0_sub1, [[COPY15]].sub2_sub3
   ; CHECK-NEXT:   [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 3, implicit-def dead $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.291, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0 :: (invariant load (s128) from %ir.261, addrspace 4)
   ; CHECK-NEXT:   [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
   ; CHECK-NEXT:   [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
   ; CHECK-NEXT:   undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc
   ; CHECK-NEXT:   %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir..i0100.i, align 8, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %485, 0, 0 :: (invariant load (s32) from %ir.298, align 8, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   KILL [[S_LOAD_DWORDX4_IMM24]]
@@ -245,13 +245,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[S_ADDC_U32_5:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %33:sreg_32, 0, implicit-def dead $scc, implicit $scc
   ; CHECK-NEXT:   undef %514.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_]], implicit-def $scc
   ; CHECK-NEXT:   %514.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.351, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %514, 0, 0 :: (invariant load (s128) from %ir.316, addrspace 4)
   ; CHECK-NEXT:   undef %522.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_1]], implicit-def $scc
   ; CHECK-NEXT:   %522.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.357, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %522, 0, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
   ; CHECK-NEXT:   undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc
   ; CHECK-NEXT:   %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.363, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0 :: (invariant load (s128) from %ir.326, addrspace 4)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
   ; CHECK-NEXT:   [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
@@ -370,7 +370,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   ; CHECK-NEXT:   [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
   ; CHECK-NEXT:   [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
   ; CHECK-NEXT:   [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant load (s256) from `<8 x i32> addrspace(4)* undef`, addrspace 4)
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %564:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) undef`, addrspace 4)
   ; CHECK-NEXT:   [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
   ; CHECK-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK-NEXT:   [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
@@ -396,421 +396,386 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
   %14 = extractelement <31 x i32> %userData, i64 30
   %15 = insertelement <2 x i32> undef, i32 %13, i32 0
   %16 = bitcast <2 x i32> %15 to i64
-  %17 = inttoptr i64 %16 to i8 addrspace(4)*
+  %17 = inttoptr i64 %16 to ptr addrspace(4)
   %18 = insertelement <2 x i32> undef, i32 %12, i32 0
   %19 = bitcast <2 x i32> %18 to i64
-  %20 = inttoptr i64 %19 to i8 addrspace(4)*
+  %20 = inttoptr i64 %19 to ptr addrspace(4)
   %21 = insertelement <2 x i32> undef, i32 %11, i32 0
   %22 = bitcast <2 x i32> %21 to i64
   %23 = insertelement <2 x i32> undef, i32 %10, i32 0
   %24 = bitcast <2 x i32> %23 to i64
   %25 = insertelement <2 x i32> undef, i32 %9, i32 0
   %26 = bitcast <2 x i32> %25 to i64
-  %27 = inttoptr i64 %26 to i8 addrspace(4)*
+  %27 = inttoptr i64 %26 to ptr addrspace(4)
   %28 = insertelement <2 x i32> undef, i32 %8, i32 0
   %29 = bitcast <2 x i32> %28 to i64
   %30 = insertelement <2 x i32> undef, i32 %7, i32 0
   %31 = bitcast <2 x i32> %30 to i64
-  %32 = inttoptr i64 %31 to i8 addrspace(4)*
+  %32 = inttoptr i64 %31 to ptr addrspace(4)
   %33 = insertelement <2 x i32> undef, i32 %6, i32 0
   %34 = bitcast <2 x i32> %33 to i64
-  %35 = inttoptr i64 %34 to i8 addrspace(4)*
+  %35 = inttoptr i64 %34 to ptr addrspace(4)
   %36 = insertelement <2 x i32> undef, i32 %14, i32 0
   %37 = bitcast <2 x i32> %36 to i64
-  %38 = inttoptr i64 %37 to i8 addrspace(4)*
-  %39 = getelementptr i8, i8 addrspace(4)* %38, i64 232
-  %.i0.i = bitcast i8 addrspace(4)* %39 to i32 addrspace(4)*
-  %rootDesc58.ii0.i = load i32, i32 addrspace(4)* %.i0.i, align 8
-  %.i184.i = getelementptr i8, i8 addrspace(4)* %38, i64 236
-  %40 = bitcast i8 addrspace(4)* %.i184.i to i32 addrspace(4)*
-  %rootDesc58.ii1.i = load i32, i32 addrspace(4)* %40, align 4
-  %41 = and i32 %rootDesc58.ii1.i, 65535
-  %42 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %rootDesc58.ii0.i, i32 0
-  %43 = insertelement <4 x i32> %42, i32 %41, i32 1
-  %44 = and i32 undef, 65535
-  %45 = insertelement <4 x i32> undef, i32 %44, i32 1
-  %46 = load <4 x i32>, <4 x i32> addrspace(4)* undef, align 16
-  %47 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %46, i32 0, i32 0, i32 0, i32 0)
-  %48 = add i32 %47, -1
-  %49 = shl i32 %0, 4
-  %50 = call i32 @llvm.amdgcn.readfirstlane(i32 %49)
-  %51 = sext i32 %50 to i64
-  %52 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %53 = add i32 %52, -2
-  %54 = or i32 %53, %48
-  %55 = shl i32 %1, 4
-  %56 = call i32 @llvm.amdgcn.readfirstlane(i32 %55)
-  %57 = sext i32 %56 to i64
-  %58 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %59 = add i32 %58, -3
-  %60 = or i32 %54, %59
-  %61 = shl i32 %2, 4
-  %62 = call i32 @llvm.amdgcn.readfirstlane(i32 %61)
-  %63 = sext i32 %62 to i64
-  %64 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %65 = add i32 %64, -4
-  %66 = or i32 %60, %65
-  %67 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %68 = add i32 %67, -27
-  %69 = or i32 %66, %68
-  %70 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 0, i32 0, i32 0)
-  %71 = add i32 %70, -28
-  %72 = or i32 %69, %71
-  %73 = call i32 @llvm.amdgcn.readfirstlane(i32 %0)
-  %74 = getelementptr i8, i8 addrspace(4)* %35, i64 16
-  %75 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
-  %76 = add i32 %75, -29
-  %77 = or i32 %72, %76
-  %78 = call i32 @llvm.amdgcn.readfirstlane(i32 %1)
-  %79 = shl i32 %78, 4
-  %80 = sext i32 %79 to i64
-  %81 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
-  %82 = add i32 %81, -30
-  %83 = or i32 %77, %82
-  %84 = call i32 @llvm.amdgcn.readfirstlane(i32 %2)
-  %85 = shl i32 %84, 4
-  %86 = sext i32 %85 to i64
-  %87 = getelementptr i8, i8 addrspace(4)* %74, i64 %86
-  %88 = bitcast i8 addrspace(4)* %87 to <4 x i32> addrspace(4)*
-  %89 = load <4 x i32>, <4 x i32> addrspace(4)* %88, align 16
-  %90 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %89, i32 0, i32 0)
-  %91 = add i32 %90, -31
-  %92 = or i32 %83, %91
-  %93 = getelementptr i8, i8 addrspace(4)* %35, i64 64
-  %94 = getelementptr i8, i8 addrspace(4)* %93, i64 %51
-  %95 = bitcast i8 addrspace(4)* %94 to <4 x i32> addrspace(4)*
-  %96 = load <4 x i32>, <4 x i32> addrspace(4)* %95, align 16
-  %97 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %96, i32 0, i32 0, i32 0, i32 0)
-  %98 = add i32 %97, -32
-  %99 = or i32 %92, %98
-  %100 = getelementptr i8, i8 addrspace(4)* %93, i64 %57
-  %101 = bitcast i8 addrspace(4)* %100 to <4 x i32> addrspace(4)*
-  %102 = load <4 x i32>, <4 x i32> addrspace(4)* %101, align 16
-  %103 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %102, i32 0, i32 0, i32 0, i32 0)
-  %104 = add i32 %103, -33
-  %105 = or i32 %99, %104
-  %106 = getelementptr i8, i8 addrspace(4)* %93, i64 %63
-  %107 = bitcast i8 addrspace(4)* %106 to <4 x i32> addrspace(4)*
-  %108 = load <4 x i32>, <4 x i32> addrspace(4)* %107, align 16
-  %109 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %108, i32 0, i32 0, i32 0, i32 0)
-  %110 = add i32 %109, -34
-  %111 = or i32 %105, %110
-  %112 = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
-  %113 = sext i32 %112 to i64
-  %114 = getelementptr i8, i8 addrspace(4)* %93, i64 %113
-  %115 = bitcast i8 addrspace(4)* %114 to <4 x i32> addrspace(4)*
-  %116 = load <4 x i32>, <4 x i32> addrspace(4)* %115, align 16
-  %117 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %116, i32 0, i32 0, i32 0, i32 0)
-  %118 = add i32 %117, -36
-  %119 = or i32 %111, %118
-  %120 = getelementptr i8, i8 addrspace(4)* %32, i64 %51
-  %121 = bitcast i8 addrspace(4)* %120 to <4 x i32> addrspace(4)*
-  %122 = load <4 x i32>, <4 x i32> addrspace(4)* %121, align 16
-  %123 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %122, i32 0, i32 0, i32 0, i32 0)
-  %124 = add i32 %123, -37
-  %125 = or i32 %119, %124
-  %126 = getelementptr i8, i8 addrspace(4)* %32, i64 %57
-  %127 = bitcast i8 addrspace(4)* %126 to <4 x i32> addrspace(4)*
-  %128 = load <4 x i32>, <4 x i32> addrspace(4)* %127, align 16
-  %129 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %128, i32 0, i32 0, i32 0, i32 0)
-  %130 = add i32 %129, -38
-  %131 = or i32 %125, %130
-  %132 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %133 = add i32 %132, -39
-  %134 = or i32 %131, %133
-  %135 = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
-  %136 = sext i32 %135 to i64
-  %137 = getelementptr i8, i8 addrspace(4)* %32, i64 %136
-  %138 = bitcast i8 addrspace(4)* %137 to <4 x i32> addrspace(4)*
-  %139 = load <4 x i32>, <4 x i32> addrspace(4)* %138, align 16
-  %140 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %139, i32 0, i32 0, i32 0, i32 0)
-  %141 = add i32 %140, -50
-  %142 = or i32 %134, %141
-  %143 = getelementptr i8, i8 addrspace(4)* %32, i64 224
-  %144 = getelementptr i8, i8 addrspace(4)* %143, i64 %51
-  %145 = bitcast i8 addrspace(4)* %144 to <4 x i32> addrspace(4)*
-  %146 = load <4 x i32>, <4 x i32> addrspace(4)* %145, align 16
+  %38 = inttoptr i64 %37 to ptr addrspace(4)
+  %39 = getelementptr i8, ptr addrspace(4) %38, i64 232
+  %rootDesc58.ii0.i = load i32, ptr addrspace(4) %39, align 8
+  %.i184.i = getelementptr i8, ptr addrspace(4) %38, i64 236
+  %rootDesc58.ii1.i = load i32, ptr addrspace(4) %.i184.i, align 4
+  %40 = and i32 %rootDesc58.ii1.i, 65535
+  %41 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %rootDesc58.ii0.i, i32 0
+  %42 = insertelement <4 x i32> %41, i32 %40, i32 1
+  %43 = and i32 undef, 65535
+  %44 = insertelement <4 x i32> undef, i32 %43, i32 1
+  %45 = load <4 x i32>, ptr addrspace(4) undef, align 16
+  %46 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %45, i32 0, i32 0, i32 0, i32 0)
+  %47 = add i32 %46, -1
+  %48 = shl i32 %0, 4
+  %49 = call i32 @llvm.amdgcn.readfirstlane(i32 %48)
+  %50 = sext i32 %49 to i64
+  %51 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %52 = add i32 %51, -2
+  %53 = or i32 %52, %47
+  %54 = shl i32 %1, 4
+  %55 = call i32 @llvm.amdgcn.readfirstlane(i32 %54)
+  %56 = sext i32 %55 to i64
+  %57 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %58 = add i32 %57, -3
+  %59 = or i32 %53, %58
+  %60 = shl i32 %2, 4
+  %61 = call i32 @llvm.amdgcn.readfirstlane(i32 %60)
+  %62 = sext i32 %61 to i64
+  %63 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %64 = add i32 %63, -4
+  %65 = or i32 %59, %64
+  %66 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %67 = add i32 %66, -27
+  %68 = or i32 %65, %67
+  %69 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 0, i32 0, i32 0)
+  %70 = add i32 %69, -28
+  %71 = or i32 %68, %70
+  %72 = call i32 @llvm.amdgcn.readfirstlane(i32 %0)
+  %73 = getelementptr i8, ptr addrspace(4) %35, i64 16
+  %74 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
+  %75 = add i32 %74, -29
+  %76 = or i32 %71, %75
+  %77 = call i32 @llvm.amdgcn.readfirstlane(i32 %1)
+  %78 = shl i32 %77, 4
+  %79 = sext i32 %78 to i64
+  %80 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
+  %81 = add i32 %80, -30
+  %82 = or i32 %76, %81
+  %83 = call i32 @llvm.amdgcn.readfirstlane(i32 %2)
+  %84 = shl i32 %83, 4
+  %85 = sext i32 %84 to i64
+  %86 = getelementptr i8, ptr addrspace(4) %73, i64 %85
+  %87 = load <4 x i32>, ptr addrspace(4) %86, align 16
+  %88 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %87, i32 0, i32 0)
+  %89 = add i32 %88, -31
+  %90 = or i32 %82, %89
+  %91 = getelementptr i8, ptr addrspace(4) %35, i64 64
+  %92 = getelementptr i8, ptr addrspace(4) %91, i64 %50
+  %93 = load <4 x i32>, ptr addrspace(4) %92, align 16
+  %94 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %93, i32 0, i32 0, i32 0, i32 0)
+  %95 = add i32 %94, -32
+  %96 = or i32 %90, %95
+  %97 = getelementptr i8, ptr addrspace(4) %91, i64 %56
+  %98 = load <4 x i32>, ptr addrspace(4) %97, align 16
+  %99 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %98, i32 0, i32 0, i32 0, i32 0)
+  %100 = add i32 %99, -33
+  %101 = or i32 %96, %100
+  %102 = getelementptr i8, ptr addrspace(4) %91, i64 %62
+  %103 = load <4 x i32>, ptr addrspace(4) %102, align 16
+  %104 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %103, i32 0, i32 0, i32 0, i32 0)
+  %105 = add i32 %104, -34
+  %106 = or i32 %101, %105
+  %107 = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+  %108 = sext i32 %107 to i64
+  %109 = getelementptr i8, ptr addrspace(4) %91, i64 %108
+  %110 = load <4 x i32>, ptr addrspace(4) %109, align 16
+  %111 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %110, i32 0, i32 0, i32 0, i32 0)
+  %112 = add i32 %111, -36
+  %113 = or i32 %106, %112
+  %114 = getelementptr i8, ptr addrspace(4) %32, i64 %50
+  %115 = load <4 x i32>, ptr addrspace(4) %114, align 16
+  %116 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %115, i32 0, i32 0, i32 0, i32 0)
+  %117 = add i32 %116, -37
+  %118 = or i32 %113, %117
+  %119 = getelementptr i8, ptr addrspace(4) %32, i64 %56
+  %120 = load <4 x i32>, ptr addrspace(4) %119, align 16
+  %121 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %120, i32 0, i32 0, i32 0, i32 0)
+  %122 = add i32 %121, -38
+  %123 = or i32 %118, %122
+  %124 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %125 = add i32 %124, -39
+  %126 = or i32 %123, %125
+  %127 = call i32 @llvm.amdgcn.readfirstlane(i32 undef)
+  %128 = sext i32 %127 to i64
+  %129 = getelementptr i8, ptr addrspace(4) %32, i64 %128
+  %130 = load <4 x i32>, ptr addrspace(4) %129, align 16
+  %131 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %130, i32 0, i32 0, i32 0, i32 0)
+  %132 = add i32 %131, -50
+  %133 = or i32 %126, %132
+  %134 = getelementptr i8, ptr addrspace(4) %32, i64 224
+  %135 = getelementptr i8, ptr addrspace(4) %134, i64 %50
+  %136 = load <4 x i32>, ptr addrspace(4) %135, align 16
+  %137 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %136, i32 0, i32 0, i32 0, i32 0)
+  %138 = add i32 %137, -51
+  %139 = or i32 %133, %138
+  %140 = getelementptr i8, ptr addrspace(4) %134, i64 %56
+  %141 = load <4 x i32>, ptr addrspace(4) %140, align 16
+  %142 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %141, i32 0, i32 0, i32 0, i32 0)
+  %143 = add i32 %142, -52
+  %144 = or i32 %139, %143
+  %145 = getelementptr i8, ptr addrspace(4) %134, i64 %62
+  %146 = load <4 x i32>, ptr addrspace(4) %145, align 16
   %147 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %146, i32 0, i32 0, i32 0, i32 0)
-  %148 = add i32 %147, -51
-  %149 = or i32 %142, %148
-  %150 = getelementptr i8, i8 addrspace(4)* %143, i64 %57
-  %151 = bitcast i8 addrspace(4)* %150 to <4 x i32> addrspace(4)*
-  %152 = load <4 x i32>, <4 x i32> addrspace(4)* %151, align 16
+  %148 = add i32 %147, -53
+  %149 = or i32 %144, %148
+  %150 = sext i32 undef to i64
+  %151 = getelementptr i8, ptr addrspace(4) %134, i64 %150
+  %152 = load <4 x i32>, ptr addrspace(4) %151, align 16
   %153 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %152, i32 0, i32 0, i32 0, i32 0)
-  %154 = add i32 %153, -52
+  %154 = add i32 %153, -72
   %155 = or i32 %149, %154
-  %156 = getelementptr i8, i8 addrspace(4)* %143, i64 %63
-  %157 = bitcast i8 addrspace(4)* %156 to <4 x i32> addrspace(4)*
-  %158 = load <4 x i32>, <4 x i32> addrspace(4)* %157, align 16
+  %156 = getelementptr i8, ptr addrspace(4) %32, i64 576
+  %157 = getelementptr i8, ptr addrspace(4) %156, i64 %50
+  %158 = load <4 x i32>, ptr addrspace(4) %157, align 16
   %159 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %158, i32 0, i32 0, i32 0, i32 0)
-  %160 = add i32 %159, -53
+  %160 = add i32 %159, -73
   %161 = or i32 %155, %160
-  %162 = sext i32 undef to i64
-  %163 = getelementptr i8, i8 addrspace(4)* %143, i64 %162
-  %164 = bitcast i8 addrspace(4)* %163 to <4 x i32> addrspace(4)*
-  %165 = load <4 x i32>, <4 x i32> addrspace(4)* %164, align 16
-  %166 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %165, i32 0, i32 0, i32 0, i32 0)
-  %167 = add i32 %166, -72
-  %168 = or i32 %161, %167
-  %169 = getelementptr i8, i8 addrspace(4)* %32, i64 576
-  %170 = getelementptr i8, i8 addrspace(4)* %169, i64 %51
-  %171 = bitcast i8 addrspace(4)* %170 to <4 x i32> addrspace(4)*
-  %172 = load <4 x i32>, <4 x i32> addrspace(4)* %171, align 16
-  %173 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %172, i32 0, i32 0, i32 0, i32 0)
-  %174 = add i32 %173, -73
-  %175 = or i32 %168, %174
-  %176 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %177 = add i32 %176, -74
-  %178 = or i32 %175, %177
-  %179 = getelementptr i8, i8 addrspace(4)* %169, i64 %63
-  %180 = bitcast i8 addrspace(4)* %179 to <4 x i32> addrspace(4)*
-  %181 = load <4 x i32>, <4 x i32> addrspace(4)* %180, align 16
-  %182 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %181, i32 0, i32 0, i32 0, i32 0)
-  %183 = add i32 %182, -75
-  %184 = or i32 %178, %183
-  %185 = getelementptr i8, i8 addrspace(4)* %169, i64 %113
-  %186 = bitcast i8 addrspace(4)* %185 to <4 x i32> addrspace(4)*
-  %187 = load <4 x i32>, <4 x i32> addrspace(4)* %186, align 16
-  %188 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %187, i32 0, i32 0, i32 0, i32 0)
-  %189 = add i32 %188, -77
-  %190 = or i32 %184, %189
-  %191 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %192 = add i32 %191, -93
-  %193 = or i32 %190, %192
-  %194 = inttoptr i64 %29 to i8 addrspace(4)*
-  %195 = getelementptr i8, i8 addrspace(4)* %194, i64 %51
-  %196 = bitcast i8 addrspace(4)* %195 to <4 x i32> addrspace(4)*
-  %197 = load <4 x i32>, <4 x i32> addrspace(4)* %196, align 16
-  %198 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %197, i32 0, i32 0, i32 0, i32 0)
-  %199 = add i32 %198, -94
-  %200 = or i32 %193, %199
-  %201 = load <4 x i32>, <4 x i32> addrspace(4)* undef, align 16
-  %202 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %201, i32 0, i32 0, i32 0)
-  %203 = add i32 %202, -95
-  %204 = or i32 %200, %203
-  %205 = getelementptr i8, i8 addrspace(4)* %27, i64 %80
-  %206 = bitcast i8 addrspace(4)* %205 to <4 x i32> addrspace(4)*
-  %207 = load <4 x i32>, <4 x i32> addrspace(4)* %206, align 16
-  %208 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %207, i32 0, i32 0, i32 0)
-  %209 = add i32 %208, -96
-  %210 = or i32 %204, %209
-  %211 = getelementptr i8, i8 addrspace(4)* %27, i64 %86
-  %212 = bitcast i8 addrspace(4)* %211 to <4 x i32> addrspace(4)*
-  %213 = load <4 x i32>, <4 x i32> addrspace(4)* %212, align 16
-  %214 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %213, i32 0, i32 0, i32 0)
-  %215 = add i32 %214, -97
-  %216 = or i32 %210, %215
-  %217 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, <{ [4 x i32], [6 x %llpc.array.element] }> addrspace(6)* null, i32 0, i32 1, i32 %0, i32 0
-  %218 = ptrtoint i32 addrspace(6)* %217 to i32
-  %219 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %45, i32 %218, i32 0)
-  %220 = add i32 %219, -98
-  %221 = or i32 %216, %220
-  %222 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %45, i32 undef, i32 0)
-  %223 = add i32 %222, -114
-  %224 = or i32 %221, %223
-  %225 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, <{ [4 x i32], [6 x %llpc.array.element] }> addrspace(6)* null, i32 0, i32 1, i32 %2, i32 0
-  %226 = ptrtoint i32 addrspace(6)* %225 to i32
-  %227 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %45, i32 %226, i32 0)
-  %228 = add i32 %227, -130
-  %229 = or i32 %224, %228
-  %230 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, <{ [4 x i32], [6 x %llpc.array.element] }> addrspace(6)* null, i32 0, i32 1, i32 undef, i32 0
-  %231 = ptrtoint i32 addrspace(6)* %230 to i32
-  %232 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %45, i32 %231, i32 0)
-  %233 = add i32 %232, -178
-  %234 = or i32 %229, %233
-  %235 = inttoptr i64 %24 to i8 addrspace(4)*
-  %236 = getelementptr i8, i8 addrspace(4)* %235, i64 %51
-  %237 = bitcast i8 addrspace(4)* %236 to <4 x i32> addrspace(4)*
-  %238 = load <4 x i32>, <4 x i32> addrspace(4)* %237, align 16
-  %239 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %238, i32 0, i32 0, i32 0, i32 0)
-  %240 = add i32 %239, -194
-  %241 = or i32 %234, %240
-  %242 = inttoptr i64 %22 to i8 addrspace(4)*
-  %243 = getelementptr i8, i8 addrspace(4)* %242, i64 %51
-  %244 = bitcast i8 addrspace(4)* %243 to <4 x i32> addrspace(4)*
-  %245 = load <4 x i32>, <4 x i32> addrspace(4)* %244, align 16
-  %246 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %245, i32 0, i32 0, i32 0, i32 0)
-  %247 = add i32 %246, -195
-  %248 = or i32 %241, %247
-  %249 = getelementptr i8, i8 addrspace(4)* %242, i64 %57
-  %250 = bitcast i8 addrspace(4)* %249 to <4 x i32> addrspace(4)*
-  %251 = load <4 x i32>, <4 x i32> addrspace(4)* %250, align 16
-  %252 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %251, i32 0, i32 0, i32 0, i32 0)
-  %253 = add i32 %252, -196
-  %254 = or i32 %248, %253
-  %255 = getelementptr i8, i8 addrspace(4)* %242, i64 %63
-  %256 = bitcast i8 addrspace(4)* %255 to <4 x i32> addrspace(4)*
-  %257 = load <4 x i32>, <4 x i32> addrspace(4)* %256, align 16
-  %258 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %257, i32 0, i32 0, i32 0, i32 0)
-  %259 = add i32 %258, -197
-  %260 = or i32 %254, %259
-  %261 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %262 = add i32 %261, -216
-  %263 = or i32 %260, %262
-  %264 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, <{ [4 x i32], [6 x %llpc.array.element.2] }> addrspace(6)* null, i32 0, i32 1, i32 %0, i32 0
-  %265 = ptrtoint i32 addrspace(6)* %264 to i32
-  %266 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %265, i32 0)
-  %267 = add i32 %266, -217
-  %268 = or i32 %263, %267
-  %269 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %270 = add i32 %269, -233
-  %271 = or i32 %268, %270
-  %272 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, <{ [4 x i32], [6 x %llpc.array.element.2] }> addrspace(6)* null, i32 0, i32 1, i32 %2, i32 0
-  %273 = ptrtoint i32 addrspace(6)* %272 to i32
-  %274 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %273, i32 0)
-  %275 = add i32 %274, -249
-  %276 = or i32 %271, %275
-  %277 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, <{ [4 x i32], [6 x %llpc.array.element.2] }> addrspace(6)* null, i32 0, i32 1, i32 undef, i32 0
-  %278 = ptrtoint i32 addrspace(6)* %277 to i32
-  %279 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %278, i32 0)
-  %280 = add i32 %279, -297
-  %281 = or i32 %276, %280
-  %282 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %283 = add i32 %282, -313
-  %284 = or i32 %281, %283
-  %285 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %286 = add i32 %285, -329
-  %287 = or i32 %284, %286
-  %288 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %289 = add i32 %288, -345
-  %290 = or i32 %287, %289
-  %291 = getelementptr <{ [4 x i32], [9 x %llpc.array.element.5] }>, <{ [4 x i32], [9 x %llpc.array.element.5] }> addrspace(6)* null, i32 0, i32 1, i32 %4, i32 0
-  %292 = ptrtoint i32 addrspace(6)* %291 to i32
-  %293 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %292, i32 0)
-  %294 = add i32 %293, -441
-  %295 = or i32 %290, %294
-  %296 = getelementptr i8, i8 addrspace(4)* %20, i64 160
-  %297 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %298 = add i32 %297, -457
-  %299 = or i32 %295, %298
-  %300 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %301 = add i32 %300, -458
-  %302 = or i32 %299, %301
-  %303 = getelementptr i8, i8 addrspace(4)* %296, i64 %63
-  %304 = bitcast i8 addrspace(4)* %303 to <4 x i32> addrspace(4)*
-  %305 = load <4 x i32>, <4 x i32> addrspace(4)* %304, align 16
-  %306 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %305, i32 0, i32 0, i32 0, i32 0)
-  %307 = add i32 %306, -459
-  %308 = or i32 %302, %307
-  %309 = shl i32 %5, 4
-  %310 = call i32 @llvm.amdgcn.readfirstlane(i32 %309)
-  %311 = sext i32 %310 to i64
-  %312 = getelementptr i8, i8 addrspace(4)* %296, i64 %311
-  %313 = bitcast i8 addrspace(4)* %312 to <4 x i32> addrspace(4)*
-  %314 = load <4 x i32>, <4 x i32> addrspace(4)* %313, align 16
-  %315 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %314, i32 0, i32 0, i32 0, i32 0)
-  %316 = add i32 %315, -466
-  %317 = or i32 %308, %316
-  %318 = getelementptr i8, i8 addrspace(4)* %38, i64 168
-  %319 = shl i32 %73, 3
-  %320 = sext i32 %319 to i64
-  %321 = getelementptr i8, i8 addrspace(4)* %318, i64 %320
-  %.i085.i = bitcast i8 addrspace(4)* %321 to i32 addrspace(4)*
-  %.ii0.i = load i32, i32 addrspace(4)* %.i085.i, align 8
-  %322 = and i32 undef, 65535
-  %323 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii0.i, i32 0
-  %324 = insertelement <4 x i32> %323, i32 %322, i32 1
-  %325 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %324, i32 0, i32 0)
-  %326 = add i32 %325, -467
-  %327 = or i32 %317, %326
-  %328 = shl i32 %78, 3
-  %329 = sext i32 %328 to i64
-  %330 = getelementptr i8, i8 addrspace(4)* %318, i64 %329
-  %.i088.i = bitcast i8 addrspace(4)* %330 to i32 addrspace(4)*
-  %.ii090.i = load i32, i32 addrspace(4)* %.i088.i, align 8
-  %.i191.i = getelementptr i8, i8 addrspace(4)* %330, i64 4
-  %331 = bitcast i8 addrspace(4)* %.i191.i to i32 addrspace(4)*
-  %.ii192.i = load i32, i32 addrspace(4)* %331, align 4
-  %332 = and i32 %.ii192.i, 65535
-  %333 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii090.i, i32 0
-  %334 = insertelement <4 x i32> %333, i32 %332, i32 1
-  %335 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %334, i32 0, i32 0)
-  %336 = add i32 %335, -468
-  %337 = or i32 %327, %336
-  %338 = shl i32 %84, 3
-  %339 = sext i32 %338 to i64
-  %340 = getelementptr i8, i8 addrspace(4)* %318, i64 %339
-  %.i094.i = bitcast i8 addrspace(4)* %340 to i32 addrspace(4)*
-  %.ii096.i = load i32, i32 addrspace(4)* %.i094.i, align 8
-  %.i197.i = getelementptr i8, i8 addrspace(4)* %340, i64 4
-  %341 = bitcast i8 addrspace(4)* %.i197.i to i32 addrspace(4)*
-  %.ii198.i = load i32, i32 addrspace(4)* %341, align 4
-  %342 = and i32 %.ii198.i, 65535
-  %343 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii096.i, i32 0
-  %344 = insertelement <4 x i32> %343, i32 %342, i32 1
-  %345 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %344, i32 0, i32 0)
-  %346 = add i32 %345, -469
-  %347 = or i32 %337, %346
-  %348 = call i32 @llvm.amdgcn.readfirstlane(i32 %3)
-  %349 = shl i32 %348, 3
-  %350 = sext i32 %349 to i64
-  %351 = getelementptr i8, i8 addrspace(4)* %318, i64 %350
-  %.i0100.i = bitcast i8 addrspace(4)* %351 to i32 addrspace(4)*
-  %.ii0102.i = load i32, i32 addrspace(4)* %.i0100.i, align 8
-  %.ii1104.i = load i32, i32 addrspace(4)* undef, align 4
-  %352 = and i32 %.ii1104.i, 65535
-  %353 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii0102.i, i32 0
-  %354 = insertelement <4 x i32> %353, i32 %352, i32 1
-  %355 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %354, i32 0, i32 0)
-  %356 = add i32 %355, -473
-  %357 = or i32 %347, %356
-  %358 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
-  %359 = add i32 %358, -474
-  %360 = or i32 %357, %359
-  %361 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %362 = add i32 %361, -475
-  %363 = or i32 %360, %362
-  %364 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %365 = add i32 %364, -491
-  %366 = or i32 %363, %365
-  %367 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %368 = add i32 %367, -507
-  %369 = or i32 %366, %368
-  %370 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
-  %371 = add i32 %370, -539
-  %372 = or i32 %369, %371
-  %373 = getelementptr i8, i8 addrspace(4)* %17, i64 96
-  %374 = getelementptr i8, i8 addrspace(4)* %373, i64 %51
-  %375 = bitcast i8 addrspace(4)* %374 to <4 x i32> addrspace(4)*
-  %376 = load <4 x i32>, <4 x i32> addrspace(4)* %375, align 16
-  %377 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %376, i32 0, i32 0, i32 0, i32 0)
-  %378 = add i32 %377, -555
-  %379 = or i32 %372, %378
-  %380 = getelementptr i8, i8 addrspace(4)* %373, i64 %57
-  %381 = bitcast i8 addrspace(4)* %380 to <4 x i32> addrspace(4)*
-  %382 = load <4 x i32>, <4 x i32> addrspace(4)* %381, align 16
-  %383 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %382, i32 0, i32 0, i32 0, i32 0)
-  %384 = add i32 %383, -556
-  %385 = or i32 %379, %384
-  %386 = getelementptr i8, i8 addrspace(4)* %373, i64 %63
-  %387 = bitcast i8 addrspace(4)* %386 to <4 x i32> addrspace(4)*
-  %388 = load <4 x i32>, <4 x i32> addrspace(4)* %387, align 16
-  %389 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %388, i32 0, i32 0, i32 0, i32 0)
-  %390 = add i32 %389, -557
-  %391 = or i32 %385, %390
-  %392 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %393 = add i32 %392, -574
-  %394 = or i32 %391, %393
-  %395 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %396 = add i32 %395, -575
-  %397 = or i32 %394, %396
-  %398 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %399 = add i32 %398, -576
-  %400 = or i32 %397, %399
-  %401 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %402 = add i32 %401, -577
-  %403 = or i32 %400, %402
-  %404 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
-  %405 = add i32 %404, -593
-  %406 = or i32 %403, %405
-  %407 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %43, i32 0, i32 0)
-  %408 = add i32 %407, -594
-  %409 = or i32 %406, %408
-  %.not.i = icmp eq i32 %409, 0
-  %410 = load <8 x i32>, <8 x i32> addrspace(4)* undef, align 32
+  %162 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %163 = add i32 %162, -74
+  %164 = or i32 %161, %163
+  %165 = getelementptr i8, ptr addrspace(4) %156, i64 %62
+  %166 = load <4 x i32>, ptr addrspace(4) %165, align 16
+  %167 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %166, i32 0, i32 0, i32 0, i32 0)
+  %168 = add i32 %167, -75
+  %169 = or i32 %164, %168
+  %170 = getelementptr i8, ptr addrspace(4) %156, i64 %108
+  %171 = load <4 x i32>, ptr addrspace(4) %170, align 16
+  %172 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %171, i32 0, i32 0, i32 0, i32 0)
+  %173 = add i32 %172, -77
+  %174 = or i32 %169, %173
+  %175 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %176 = add i32 %175, -93
+  %177 = or i32 %174, %176
+  %178 = inttoptr i64 %29 to ptr addrspace(4)
+  %179 = getelementptr i8, ptr addrspace(4) %178, i64 %50
+  %180 = load <4 x i32>, ptr addrspace(4) %179, align 16
+  %181 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %180, i32 0, i32 0, i32 0, i32 0)
+  %182 = add i32 %181, -94
+  %183 = or i32 %177, %182
+  %184 = load <4 x i32>, ptr addrspace(4) undef, align 16
+  %185 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %184, i32 0, i32 0, i32 0)
+  %186 = add i32 %185, -95
+  %187 = or i32 %183, %186
+  %188 = getelementptr i8, ptr addrspace(4) %27, i64 %79
+  %189 = load <4 x i32>, ptr addrspace(4) %188, align 16
+  %190 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %189, i32 0, i32 0, i32 0)
+  %191 = add i32 %190, -96
+  %192 = or i32 %187, %191
+  %193 = getelementptr i8, ptr addrspace(4) %27, i64 %85
+  %194 = load <4 x i32>, ptr addrspace(4) %193, align 16
+  %195 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %194, i32 0, i32 0, i32 0)
+  %196 = add i32 %195, -97
+  %197 = or i32 %192, %196
+  %198 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %0, i32 0
+  %199 = ptrtoint ptr addrspace(6) %198 to i32
+  %200 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %199, i32 0)
+  %201 = add i32 %200, -98
+  %202 = or i32 %197, %201
+  %203 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 undef, i32 0)
+  %204 = add i32 %203, -114
+  %205 = or i32 %202, %204
+  %206 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %2, i32 0
+  %207 = ptrtoint ptr addrspace(6) %206 to i32
+  %208 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %207, i32 0)
+  %209 = add i32 %208, -130
+  %210 = or i32 %205, %209
+  %211 = getelementptr <{ [4 x i32], [6 x %llpc.array.element] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0
+  %212 = ptrtoint ptr addrspace(6) %211 to i32
+  %213 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %44, i32 %212, i32 0)
+  %214 = add i32 %213, -178
+  %215 = or i32 %210, %214
+  %216 = inttoptr i64 %24 to ptr addrspace(4)
+  %217 = getelementptr i8, ptr addrspace(4) %216, i64 %50
+  %218 = load <4 x i32>, ptr addrspace(4) %217, align 16
+  %219 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %218, i32 0, i32 0, i32 0, i32 0)
+  %220 = add i32 %219, -194
+  %221 = or i32 %215, %220
+  %222 = inttoptr i64 %22 to ptr addrspace(4)
+  %223 = getelementptr i8, ptr addrspace(4) %222, i64 %50
+  %224 = load <4 x i32>, ptr addrspace(4) %223, align 16
+  %225 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %224, i32 0, i32 0, i32 0, i32 0)
+  %226 = add i32 %225, -195
+  %227 = or i32 %221, %226
+  %228 = getelementptr i8, ptr addrspace(4) %222, i64 %56
+  %229 = load <4 x i32>, ptr addrspace(4) %228, align 16
+  %230 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %229, i32 0, i32 0, i32 0, i32 0)
+  %231 = add i32 %230, -196
+  %232 = or i32 %227, %231
+  %233 = getelementptr i8, ptr addrspace(4) %222, i64 %62
+  %234 = load <4 x i32>, ptr addrspace(4) %233, align 16
+  %235 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %234, i32 0, i32 0, i32 0, i32 0)
+  %236 = add i32 %235, -197
+  %237 = or i32 %232, %236
+  %238 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %239 = add i32 %238, -216
+  %240 = or i32 %237, %239
+  %241 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %0, i32 0
+  %242 = ptrtoint ptr addrspace(6) %241 to i32
+  %243 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %242, i32 0)
+  %244 = add i32 %243, -217
+  %245 = or i32 %240, %244
+  %246 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %247 = add i32 %246, -233
+  %248 = or i32 %245, %247
+  %249 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %2, i32 0
+  %250 = ptrtoint ptr addrspace(6) %249 to i32
+  %251 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %250, i32 0)
+  %252 = add i32 %251, -249
+  %253 = or i32 %248, %252
+  %254 = getelementptr <{ [4 x i32], [6 x %llpc.array.element.2] }>, ptr addrspace(6) null, i32 0, i32 1, i32 undef, i32 0
+  %255 = ptrtoint ptr addrspace(6) %254 to i32
+  %256 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %255, i32 0)
+  %257 = add i32 %256, -297
+  %258 = or i32 %253, %257
+  %259 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %260 = add i32 %259, -313
+  %261 = or i32 %258, %260
+  %262 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %263 = add i32 %262, -329
+  %264 = or i32 %261, %263
+  %265 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %266 = add i32 %265, -345
+  %267 = or i32 %264, %266
+  %268 = getelementptr <{ [4 x i32], [9 x %llpc.array.element.5] }>, ptr addrspace(6) null, i32 0, i32 1, i32 %4, i32 0
+  %269 = ptrtoint ptr addrspace(6) %268 to i32
+  %270 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 %269, i32 0)
+  %271 = add i32 %270, -441
+  %272 = or i32 %267, %271
+  %273 = getelementptr i8, ptr addrspace(4) %20, i64 160
+  %274 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %275 = add i32 %274, -457
+  %276 = or i32 %272, %275
+  %277 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %278 = add i32 %277, -458
+  %279 = or i32 %276, %278
+  %280 = getelementptr i8, ptr addrspace(4) %273, i64 %62
+  %281 = load <4 x i32>, ptr addrspace(4) %280, align 16
+  %282 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %281, i32 0, i32 0, i32 0, i32 0)
+  %283 = add i32 %282, -459
+  %284 = or i32 %279, %283
+  %285 = shl i32 %5, 4
+  %286 = call i32 @llvm.amdgcn.readfirstlane(i32 %285)
+  %287 = sext i32 %286 to i64
+  %288 = getelementptr i8, ptr addrspace(4) %273, i64 %287
+  %289 = load <4 x i32>, ptr addrspace(4) %288, align 16
+  %290 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %289, i32 0, i32 0, i32 0, i32 0)
+  %291 = add i32 %290, -466
+  %292 = or i32 %284, %291
+  %293 = getelementptr i8, ptr addrspace(4) %38, i64 168
+  %294 = shl i32 %72, 3
+  %295 = sext i32 %294 to i64
+  %296 = getelementptr i8, ptr addrspace(4) %293, i64 %295
+  %.ii0.i = load i32, ptr addrspace(4) %296, align 8
+  %297 = and i32 undef, 65535
+  %298 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii0.i, i32 0
+  %299 = insertelement <4 x i32> %298, i32 %297, i32 1
+  %300 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %299, i32 0, i32 0)
+  %301 = add i32 %300, -467
+  %302 = or i32 %292, %301
+  %303 = shl i32 %77, 3
+  %304 = sext i32 %303 to i64
+  %305 = getelementptr i8, ptr addrspace(4) %293, i64 %304
+  %.ii090.i = load i32, ptr addrspace(4) %305, align 8
+  %.i191.i = getelementptr i8, ptr addrspace(4) %305, i64 4
+  %.ii192.i = load i32, ptr addrspace(4) %.i191.i, align 4
+  %306 = and i32 %.ii192.i, 65535
+  %307 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii090.i, i32 0
+  %308 = insertelement <4 x i32> %307, i32 %306, i32 1
+  %309 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %308, i32 0, i32 0)
+  %310 = add i32 %309, -468
+  %311 = or i32 %302, %310
+  %312 = shl i32 %83, 3
+  %313 = sext i32 %312 to i64
+  %314 = getelementptr i8, ptr addrspace(4) %293, i64 %313
+  %.ii096.i = load i32, ptr addrspace(4) %314, align 8
+  %.i197.i = getelementptr i8, ptr addrspace(4) %314, i64 4
+  %.ii198.i = load i32, ptr addrspace(4) %.i197.i, align 4
+  %315 = and i32 %.ii198.i, 65535
+  %316 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii096.i, i32 0
+  %317 = insertelement <4 x i32> %316, i32 %315, i32 1
+  %318 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %317, i32 0, i32 0)
+  %319 = add i32 %318, -469
+  %320 = or i32 %311, %319
+  %321 = call i32 @llvm.amdgcn.readfirstlane(i32 %3)
+  %322 = shl i32 %321, 3
+  %323 = sext i32 %322 to i64
+  %324 = getelementptr i8, ptr addrspace(4) %293, i64 %323
+  %.ii0102.i = load i32, ptr addrspace(4) %324, align 8
+  %.ii1104.i = load i32, ptr addrspace(4) undef, align 4
+  %325 = and i32 %.ii1104.i, 65535
+  %326 = insertelement <4 x i32> <i32 undef, i32 undef, i32 -1, i32 553734060>, i32 %.ii0102.i, i32 0
+  %327 = insertelement <4 x i32> %326, i32 %325, i32 1
+  %328 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %327, i32 0, i32 0)
+  %329 = add i32 %328, -473
+  %330 = or i32 %320, %329
+  %331 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 0, i32 0)
+  %332 = add i32 %331, -474
+  %333 = or i32 %330, %332
+  %334 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %335 = add i32 %334, -475
+  %336 = or i32 %333, %335
+  %337 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %338 = add i32 %337, -491
+  %339 = or i32 %336, %338
+  %340 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %341 = add i32 %340, -507
+  %342 = or i32 %339, %341
+  %343 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> undef, i32 undef, i32 0)
+  %344 = add i32 %343, -539
+  %345 = or i32 %342, %344
+  %346 = getelementptr i8, ptr addrspace(4) %17, i64 96
+  %347 = getelementptr i8, ptr addrspace(4) %346, i64 %50
+  %348 = load <4 x i32>, ptr addrspace(4) %347, align 16
+  %349 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %348, i32 0, i32 0, i32 0, i32 0)
+  %350 = add i32 %349, -555
+  %351 = or i32 %345, %350
+  %352 = getelementptr i8, ptr addrspace(4) %346, i64 %56
+  %353 = load <4 x i32>, ptr addrspace(4) %352, align 16
+  %354 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %353, i32 0, i32 0, i32 0, i32 0)
+  %355 = add i32 %354, -556
+  %356 = or i32 %351, %355
+  %357 = getelementptr i8, ptr addrspace(4) %346, i64 %62
+  %358 = load <4 x i32>, ptr addrspace(4) %357, align 16
+  %359 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %358, i32 0, i32 0, i32 0, i32 0)
+  %360 = add i32 %359, -557
+  %361 = or i32 %356, %360
+  %362 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %363 = add i32 %362, -574
+  %364 = or i32 %361, %363
+  %365 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %366 = add i32 %365, -575
+  %367 = or i32 %364, %366
+  %368 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %369 = add i32 %368, -576
+  %370 = or i32 %367, %369
+  %371 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %372 = add i32 %371, -577
+  %373 = or i32 %370, %372
+  %374 = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> undef, i32 0, i32 0, i32 0, i32 0)
+  %375 = add i32 %374, -593
+  %376 = or i32 %373, %375
+  %377 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %42, i32 0, i32 0)
+  %378 = add i32 %377, -594
+  %379 = or i32 %376, %378
+  %.not.i = icmp eq i32 %379, 0
+  %380 = load <8 x i32>, ptr addrspace(4) undef, align 32
   %.i010.i = select i1 %.not.i, float 0x36A0000000000000, float 0.000000e+00
-  %411 = insertelement <4 x float> undef, float %.i010.i, i32 3
-  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %411, i32 15, i32 undef, i32 undef, <8 x i32> %410, i32 0, i32 0)
+  %381 = insertelement <4 x float> undef, float %.i010.i, i32 3
+  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %381, i32 15, i32 undef, i32 undef, <8 x i32> %380, i32 0, i32 0)
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 21fdbb2b8ed6..0a21a77b5b94 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -216,7 +216,7 @@ for.end:
 }
 
 ; a loop inside an if-else
-define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
+define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
   ; SI-LABEL: name: loop
   ; SI: bb.0.main_body:
   ; SI-NEXT:   successors: %bb.6(0x40000000), %bb.1(0x40000000)
@@ -339,7 +339,7 @@ end:
 }
 
 ; a loop inside an if-else, but the variable is still in use after the if-else
-define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, float(float)* %extern_func, float(float)* %extern_func2) #0 {
+define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 {
   ; SI-LABEL: name: loop_with_use
   ; SI: bb.0.main_body:
   ; SI-NEXT:   successors: %bb.6(0x40000000), %bb.1(0x40000000)
@@ -460,7 +460,7 @@ end:
   ret float %r2
 }
 
-define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %src1) {
+define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %src1) {
   ; SI-LABEL: name: livevariables_update_missed_block
   ; SI: bb.0.entry:
   ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.5(0x40000000)
@@ -475,13 +475,13 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.7(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4)
+  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
   ; SI-NEXT:   [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec
   ; SI-NEXT:   %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1
   ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
   ; SI-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.7
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if.then9:
@@ -498,7 +498,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
   ; SI-NEXT:   successors: %bb.6(0x80000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `i8 addrspace(1)* null`, addrspace 1)
+  ; SI-NEXT:   [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `ptr addrspace(1) null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.6
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5.Flow:
@@ -513,7 +513,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(i8 addrspace(1)* %s
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
   ; SI-NEXT:   [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
-  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.5
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7.UnifiedReturnBlock:
@@ -530,21 +530,21 @@ entry:
 
 if.then:                                          ; preds = %entry
   %i9 = mul i64 %i6, 1
-  %i10 = getelementptr inbounds i8, i8 addrspace(1)* %src1, i64 %i9
-  %i11 = load i8, i8 addrspace(1)* %i10, align 1
+  %i10 = getelementptr inbounds i8, ptr addrspace(1) %src1, i64 %i9
+  %i11 = load i8, ptr addrspace(1) %i10, align 1
   %i12 = insertelement <3 x i8> zeroinitializer, i8 %i11, i64 0
   %i13 = insertelement <3 x i8> %i12, i8 0, i64 1
   %i14 = insertelement <3 x i8> %i13, i8 0, i64 1
   %i15 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i14
   %i16 = extractelement <3 x i8> %i15, i64 0
-  store i8 %i16, i8 addrspace(1)* null, align 1
+  store i8 %i16, ptr addrspace(1) null, align 1
   ret void
 
 if.then9:                                         ; preds = %entry
   br i1 undef, label %sw.bb18, label %sw.bb
 
 sw.bb:                                            ; preds = %if.then9
-  %i17 = load i8, i8 addrspace(1)* null, align 1
+  %i17 = load i8, ptr addrspace(1) null, align 1
   %i18 = insertelement <4 x i8> zeroinitializer, i8 %i17, i64 0
   %a.sroa.0.0.vecblend = shufflevector <4 x i8> %i18, <4 x i8> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 0, i32 undef>
   br label %sw.bb18
@@ -555,12 +555,12 @@ sw.bb18:                                          ; preds = %sw.bb, %if.then9
   %i19 = insertelement <3 x i8> %a.sroa.0.0.vec.extract61, i8 0, i64 0
   %i20 = select <3 x i1> zeroinitializer, <3 x i8> zeroinitializer, <3 x i8> %i19
   %i21 = extractelement <3 x i8> %i20, i64 1
-  store i8 %i21, i8 addrspace(1)* null, align 1
+  store i8 %i21, ptr addrspace(1) null, align 1
   ret void
 }
 
 %tex = type opaque
-define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.coerce) local_unnamed_addr {
+define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coerce) local_unnamed_addr {
   ; SI-LABEL: name: nested_waterfalls
   ; SI: bb.0.entry:
   ; SI-NEXT:   successors: %bb.1(0x80000000)
@@ -572,21 +572,21 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.
   ; SI-NEXT: bb.1.if.then:
   ; SI-NEXT:   successors: %bb.2(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset.cast, align 4, addrspace 4)
+  ; SI-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
   ; SI-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
   ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
-  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.6 + 16, addrspace 4)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
   ; SI-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
   ; SI-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
   ; SI-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub1
   ; SI-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub0
-  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 0, 0, implicit $exec :: (invariant load (s128) from %ir.6, align 32, addrspace 4)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_1:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 0, 0, implicit $exec :: (invariant load (s128) from %ir.3, align 32, addrspace 4)
   ; SI-NEXT:   [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub3
   ; SI-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub2
   ; SI-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_1]].sub1
   ; SI-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_1]].sub0
   ; SI-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_256 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3, killed [[COPY5]], %subreg.sub4, killed [[COPY4]], %subreg.sub5, killed [[COPY3]], %subreg.sub6, killed [[COPY2]], %subreg.sub7
-  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 killed [[GLOBAL_LOAD_DWORDX2_SADDR]], 48, 0, implicit $exec :: (invariant load (s128) from %ir.8, addrspace 4)
+  ; SI-NEXT:   [[GLOBAL_LOAD_DWORDX4_2:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 killed [[GLOBAL_LOAD_DWORDX2_SADDR]], 48, 0, implicit $exec :: (invariant load (s128) from %ir.add.ptr.i, addrspace 4)
   ; SI-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2:
@@ -650,7 +650,7 @@ define protected amdgpu_kernel void @nested_waterfalls(%tex* addrspace(1)* %tex.
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.7:
   ; SI-NEXT:   $exec_lo = S_MOV_B32 killed [[S_MOV_B32_]]
-  ; SI-NEXT:   GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `float addrspace(1)* undef`, addrspace 1)
+  ; SI-NEXT:   GLOBAL_STORE_DWORD undef %25:vreg_64, killed [[IMAGE_SAMPLE_V1_V2_gfx10_]], 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
   ; SI-NEXT:   S_ENDPGM 0
 entry:
   %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -658,18 +658,15 @@ entry:
   br label %if.then
 
 if.then:                                          ; preds = %entry
-  %idx = getelementptr inbounds %tex*, %tex* addrspace(1)* %tex.coerce, i64 %1
-  %2 = load %tex*, %tex* addrspace(1)* %idx, align 8
-  %3 = bitcast %tex* %2 to i32*
-  %4 = addrspacecast i32* %3 to i32 addrspace(4)*
-  %add.ptr.i = getelementptr inbounds i32, i32 addrspace(4)* %4, i64 12
-  %5 = bitcast %tex* %2 to <8 x i32>*
-  %6 = addrspacecast <8 x i32>* %5 to <8 x i32> addrspace(4)*
-  %7 = load <8 x i32>, <8 x i32> addrspace(4)* %6, align 32
-  %8 = bitcast i32 addrspace(4)* %add.ptr.i to <4 x i32> addrspace(4)*
-  %9 = load <4 x i32>, <4 x i32> addrspace(4)* %8, align 16
-  %10 = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float undef, <8 x i32> %7, <4 x i32> %9, i1 false, i32 0, i32 0)
-  store float %10, float addrspace(1)* undef, align 4
+  %idx = getelementptr inbounds ptr, ptr addrspace(1) %tex.coerce, i64 %1
+  %2 = load ptr, ptr addrspace(1) %idx, align 8
+  %3 = addrspacecast ptr %2 to ptr addrspace(4)
+  %add.ptr.i = getelementptr inbounds i32, ptr addrspace(4) %3, i64 12
+  %4 = addrspacecast ptr %2 to ptr addrspace(4)
+  %5 = load <8 x i32>, ptr addrspace(4) %4, align 32
+  %6 = load <4 x i32>, ptr addrspace(4) %add.ptr.i, align 16
+  %7 = tail call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float undef, float undef, <8 x i32> %5, <4 x i32> %6, i1 false, i32 0, i32 0)
+  store float %7, ptr addrspace(1) undef, align 4
   ret void
 }
 

