[llvm] c4c5fdd - [AMDGPU] Generate checks for vector indexing. NFC. (#105668)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 08:11:16 PDT 2024
Author: Jay Foad
Date: 2024-08-22T16:11:12+01:00
New Revision: c4c5fdd933fa2d1f7624d863d05a4fb982b4c074
URL: https://github.com/llvm/llvm-project/commit/c4c5fdd933fa2d1f7624d863d05a4fb982b4c074
DIFF: https://github.com/llvm/llvm-project/commit/c4c5fdd933fa2d1f7624d863d05a4fb982b4c074.diff
LOG: [AMDGPU] Generate checks for vector indexing. NFC. (#105668)
This allows combining some test files that were only split because
adding new RUN lines introduced too much churn in the checks.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
Removed:
llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
index 31fa32b3475cb7..872a457a3b5c34 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
@@ -2,70 +2,6 @@
; indexing of vectors.
-; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
-; to avoid gfx9 scheduling induced issues.
-
-
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-32: v_cndmask_b32
-
-; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
- %id.ext = zext i32 %id to i64
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
- %idx0 = load volatile i32, ptr addrspace(1) %gep
- %idx1 = add i32 %idx0, 1
- %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
- %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
- %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
- store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
- %cmp = icmp eq i32 %id, 0
- br i1 %cmp, label %bb1, label %bb2
-
-bb1:
- store volatile i32 %live.out.val, ptr addrspace(1) undef
- br label %bb2
-
-bb2:
- ret void
-}
-
-; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
-; gpr_idx mode switching sequence is expanded late for this reason.
-
-; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block
-
-; GCN: s_set_gpr_idx_on
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_set_gpr_idx_off
-
-; GCN: s_set_gpr_idx_on
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NOT: v_mov_b32_e32
-; GCN-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 {
-entry:
- %add1 = add i32 %in, 1
- %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
- %add2 = add i32 %in, 2
- %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
- store <16 x float> %ins1, ptr addrspace(1) %out1
- %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
- store <16 x float> %ins2, ptr addrspace(1) %out2
-
- ret void
-}
-
declare hidden void @foo()
; For functions with calls, we were not accounting for m0_lo16/m0_hi16
@@ -83,7 +19,4 @@ define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %i
ret void
}
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-declare void @llvm.amdgcn.s.barrier() #2
-
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
deleted file mode 100644
index 1a72140963d696..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-
-; FIXME: Merge into indirect-addressing-si.ll
-
-; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
-; of the tied implicit use and def of the super register.
-
-; CHECK-LABEL: {{^}}insert_wo_offset:
-; CHECK: s_load_dword [[IN:s[0-9]+]]
-; CHECK: s_mov_b32 m0, [[IN]]
-; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
-entry:
- %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
- store <16 x float> %ins, ptr addrspace(1) %out
- ret void
-}
-
-; Make sure we don't hit use of undefined register errors when expanding an
-; extract with undef index.
-
-; CHECK-LABEL: {{^}}extract_adjacent_blocks:
-; CHECK: s_load_dword [[ARG:s[0-9]+]]
-; CHECK: s_cmp_lg_u32
-; CHECK: s_cbranch_scc1 [[BB4:.LBB[0-9]+_[0-9]+]]
-
-; CHECK: buffer_load_dwordx4
-
-; CHECK: s_branch [[ENDBB:.LBB[0-9]+_[0-9]+]]
-
-; CHECK: [[BB4]]:
-; CHECK: buffer_load_dwordx4
-
-; CHECK: [[ENDBB]]:
-; CHECK: buffer_store_dword
-; CHECK: s_endpgm
-
-define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
-bb:
- %tmp = icmp eq i32 %arg, 0
- br i1 %tmp, label %bb1, label %bb4
-
-bb1:
- %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
- %tmp3 = extractelement <4 x float> %tmp2, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
- br label %bb7
-
-bb4:
- %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
- %tmp6 = extractelement <4 x float> %tmp5, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
- br label %bb7
-
-bb7:
- %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
- store volatile float %tmp8, ptr addrspace(1) undef
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
deleted file mode 100644
index cbb5d9e1692843..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-
-; Tests for indirect addressing on SI, which is implemented using dynamic
-; indexing of vectors.
-
-; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
-; to avoid gfx9 scheduling induced issues.
-
-
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]]
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-32: v_cndmask_b32
-
-; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
- %id.ext = zext i32 %id to i64
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
- %idx0 = load volatile i32, ptr addrspace(1) %gep
- %idx1 = add i32 %idx0, 1
- %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
- %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
- %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
- store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
- %cmp = icmp eq i32 %id, 0
- br i1 %cmp, label %bb1, label %bb2
-
-bb1:
- store volatile i32 %live.out.val, ptr addrspace(1) undef
- br label %bb2
-
-bb2:
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-declare void @llvm.amdgcn.s.barrier() #2
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index f095aef7a0cc81..c130eb04d02370 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,26 +1,197 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode < %s | FileCheck -check-prefixes=VI,VI-IDXMODE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
-
-; GCN-LABEL: {{^}}extract_w_offset:
-; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
-; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
-
-; MOVREL-DAG: s_mov_b32 m0, [[IN]]
-; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-
-; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
+; NOOPT-LABEL: extract_w_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 1
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_w_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_w_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; VI-MOVREL-NEXT: s_mov_b32 m0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_w_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_w_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%idx = add i32 %in, 1
%elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
@@ -29,24 +200,291 @@ entry:
}
; XXX: Could do v_or_b32 directly
-; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; MOVREL: s_mov_b32 m0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-
-
-; MOVREL: v_movrels_b32_e32
-
-; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
+; NOOPT-LABEL: extract_w_offset_salu_use_vector:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s21, 1
+; NOOPT-NEXT: s_add_i32 s4, s4, s21
+; NOOPT-NEXT: s_mov_b32 s5, s51
+; NOOPT-NEXT: s_mov_b32 s6, 16
+; NOOPT-NEXT: s_or_b32 s5, s5, s6
+; NOOPT-NEXT: s_mov_b32 s6, s50
+; NOOPT-NEXT: s_mov_b32 s7, 15
+; NOOPT-NEXT: s_or_b32 s6, s6, s7
+; NOOPT-NEXT: s_mov_b32 s7, s49
+; NOOPT-NEXT: s_mov_b32 s8, 14
+; NOOPT-NEXT: s_or_b32 s7, s7, s8
+; NOOPT-NEXT: s_mov_b32 s8, s48
+; NOOPT-NEXT: s_mov_b32 s9, 13
+; NOOPT-NEXT: s_or_b32 s8, s8, s9
+; NOOPT-NEXT: s_mov_b32 s9, s47
+; NOOPT-NEXT: s_mov_b32 s10, 12
+; NOOPT-NEXT: s_or_b32 s9, s9, s10
+; NOOPT-NEXT: s_mov_b32 s10, s46
+; NOOPT-NEXT: s_mov_b32 s11, 11
+; NOOPT-NEXT: s_or_b32 s10, s10, s11
+; NOOPT-NEXT: s_mov_b32 s11, s45
+; NOOPT-NEXT: s_mov_b32 s12, 10
+; NOOPT-NEXT: s_or_b32 s11, s11, s12
+; NOOPT-NEXT: s_mov_b32 s12, s44
+; NOOPT-NEXT: s_mov_b32 s13, 9
+; NOOPT-NEXT: s_or_b32 s12, s12, s13
+; NOOPT-NEXT: s_mov_b32 s13, s43
+; NOOPT-NEXT: s_mov_b32 s14, 8
+; NOOPT-NEXT: s_or_b32 s13, s13, s14
+; NOOPT-NEXT: s_mov_b32 s14, s42
+; NOOPT-NEXT: s_mov_b32 s15, 7
+; NOOPT-NEXT: s_or_b32 s14, s14, s15
+; NOOPT-NEXT: s_mov_b32 s15, s41
+; NOOPT-NEXT: s_mov_b32 s16, 6
+; NOOPT-NEXT: s_or_b32 s15, s15, s16
+; NOOPT-NEXT: s_mov_b32 s16, s40
+; NOOPT-NEXT: s_mov_b32 s17, 5
+; NOOPT-NEXT: s_or_b32 s16, s16, s17
+; NOOPT-NEXT: s_mov_b32 s17, s39
+; NOOPT-NEXT: s_mov_b32 s18, 4
+; NOOPT-NEXT: s_or_b32 s17, s17, s18
+; NOOPT-NEXT: s_mov_b32 s18, s38
+; NOOPT-NEXT: s_mov_b32 s19, 3
+; NOOPT-NEXT: s_or_b32 s18, s18, s19
+; NOOPT-NEXT: s_mov_b32 s19, s37
+; NOOPT-NEXT: s_mov_b32 s20, 2
+; NOOPT-NEXT: s_or_b32 s19, s19, s20
+; NOOPT-NEXT: s_mov_b32 s20, s36
+; NOOPT-NEXT: s_or_b32 s20, s20, s21
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_w_offset_salu_use_vector:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s20, s20, 1
+; SI-MOVREL-NEXT: s_or_b32 s4, s4, 1
+; SI-MOVREL-NEXT: s_or_b32 s19, s19, 16
+; SI-MOVREL-NEXT: s_or_b32 s18, s18, 15
+; SI-MOVREL-NEXT: s_or_b32 s17, s17, 14
+; SI-MOVREL-NEXT: s_or_b32 s16, s16, 13
+; SI-MOVREL-NEXT: s_or_b32 s15, s15, 12
+; SI-MOVREL-NEXT: s_or_b32 s14, s14, 11
+; SI-MOVREL-NEXT: s_or_b32 s13, s13, 10
+; SI-MOVREL-NEXT: s_or_b32 s12, s12, 9
+; SI-MOVREL-NEXT: s_or_b32 s11, s11, 8
+; SI-MOVREL-NEXT: s_or_b32 s10, s10, 7
+; SI-MOVREL-NEXT: s_or_b32 s9, s9, 6
+; SI-MOVREL-NEXT: s_or_b32 s8, s8, 5
+; SI-MOVREL-NEXT: s_or_b32 s7, s7, 4
+; SI-MOVREL-NEXT: s_or_b32 s6, s6, 3
+; SI-MOVREL-NEXT: s_or_b32 s5, s5, 2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; SI-MOVREL-NEXT: s_mov_b32 m0, s20
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_w_offset_salu_use_vector:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 s20, s20, 1
+; VI-MOVREL-NEXT: s_or_b32 s6, s6, 3
+; VI-MOVREL-NEXT: s_or_b32 s5, s5, 2
+; VI-MOVREL-NEXT: s_or_b32 s4, s4, 1
+; VI-MOVREL-NEXT: s_or_b32 s2, s19, 16
+; VI-MOVREL-NEXT: s_or_b32 s3, s18, 15
+; VI-MOVREL-NEXT: s_or_b32 s17, s17, 14
+; VI-MOVREL-NEXT: s_or_b32 s16, s16, 13
+; VI-MOVREL-NEXT: s_or_b32 s15, s15, 12
+; VI-MOVREL-NEXT: s_or_b32 s14, s14, 11
+; VI-MOVREL-NEXT: s_or_b32 s13, s13, 10
+; VI-MOVREL-NEXT: s_or_b32 s12, s12, 9
+; VI-MOVREL-NEXT: s_or_b32 s11, s11, 8
+; VI-MOVREL-NEXT: s_or_b32 s10, s10, 7
+; VI-MOVREL-NEXT: s_or_b32 s9, s9, 6
+; VI-MOVREL-NEXT: s_or_b32 s8, s8, 5
+; VI-MOVREL-NEXT: s_or_b32 s7, s7, 4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; VI-MOVREL-NEXT: s_mov_b32 m0, s20
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s2
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_w_offset_salu_use_vector:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s20, s20, 1
+; VI-IDXMODE-NEXT: s_or_b32 s6, s6, 3
+; VI-IDXMODE-NEXT: s_or_b32 s5, s5, 2
+; VI-IDXMODE-NEXT: s_or_b32 s4, s4, 1
+; VI-IDXMODE-NEXT: s_or_b32 s2, s19, 16
+; VI-IDXMODE-NEXT: s_or_b32 s3, s18, 15
+; VI-IDXMODE-NEXT: s_or_b32 s17, s17, 14
+; VI-IDXMODE-NEXT: s_or_b32 s16, s16, 13
+; VI-IDXMODE-NEXT: s_or_b32 s15, s15, 12
+; VI-IDXMODE-NEXT: s_or_b32 s14, s14, 11
+; VI-IDXMODE-NEXT: s_or_b32 s13, s13, 10
+; VI-IDXMODE-NEXT: s_or_b32 s12, s12, 9
+; VI-IDXMODE-NEXT: s_or_b32 s11, s11, 8
+; VI-IDXMODE-NEXT: s_or_b32 s10, s10, 7
+; VI-IDXMODE-NEXT: s_or_b32 s9, s9, 6
+; VI-IDXMODE-NEXT: s_or_b32 s8, s8, 5
+; VI-IDXMODE-NEXT: s_or_b32 s7, s7, 4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s2
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_w_offset_salu_use_vector:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s20, s20, 1
+; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, 1
+; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, 16
+; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, 15
+; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, 14
+; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, 13
+; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, 12
+; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, 11
+; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, 10
+; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, 9
+; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, 8
+; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, 7
+; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, 6
+; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, 5
+; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, 4
+; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, 3
+; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, 2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%idx = add i32 %in, 1
%vec = or <16 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
@@ -55,38 +493,371 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}extract_wo_offset:
-; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
-
-; MOVREL-DAG: s_mov_b32 m0, [[IN]]
-; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-
-; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
+; NOOPT-LABEL: extract_wo_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_wo_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_wo_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_wo_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_wo_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %in
store float %elt, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
-; The offset depends on the register that holds the first element of the vector.
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
-
-; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: v_mov_b32_e32 v14, 15
-; IDXMODE: v_mov_b32_e32 v15, 16
-; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
+; NOOPT-LABEL: extract_neg_offset_sgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 16
+; NOOPT-NEXT: s_mov_b32 s6, 15
+; NOOPT-NEXT: s_mov_b32 s7, 14
+; NOOPT-NEXT: s_mov_b32 s8, 13
+; NOOPT-NEXT: s_mov_b32 s9, 12
+; NOOPT-NEXT: s_mov_b32 s10, 11
+; NOOPT-NEXT: s_mov_b32 s11, 10
+; NOOPT-NEXT: s_mov_b32 s12, 9
+; NOOPT-NEXT: s_mov_b32 s13, 8
+; NOOPT-NEXT: s_mov_b32 s14, 7
+; NOOPT-NEXT: s_mov_b32 s15, 6
+; NOOPT-NEXT: s_mov_b32 s16, 5
+; NOOPT-NEXT: s_mov_b32 s17, 3
+; NOOPT-NEXT: s_mov_b32 s18, 2
+; NOOPT-NEXT: s_mov_b32 s19, 1
+; NOOPT-NEXT: s_mov_b32 s20, 0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_neg_offset_sgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 16
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_neg_offset_sgpr:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 16
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_neg_offset_sgpr:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 15
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 16
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%index = add i32 %offset, -512
%value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
@@ -94,32 +865,293 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
-; The offset depends on the register that holds the first element of the vector.
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
-
-; IDXMODE-DAG: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE-DAG: v_mov_b32_e32 v0,
-; IDXMODE: v_mov_b32_e32 v1,
-; IDXMODE: v_mov_b32_e32 v2,
-; IDXMODE: v_mov_b32_e32 v3,
-; IDXMODE: v_mov_b32_e32 v4,
-; IDXMODE: v_mov_b32_e32 v5,
-; IDXMODE: v_mov_b32_e32 v6,
-; IDXMODE: v_mov_b32_e32 v7,
-; IDXMODE: v_mov_b32_e32 v8,
-; IDXMODE: v_mov_b32_e32 v9,
-; IDXMODE: v_mov_b32_e32 v10,
-; IDXMODE: v_mov_b32_e32 v11,
-; IDXMODE: v_mov_b32_e32 v12,
-; IDXMODE: v_mov_b32_e32 v13,
-; IDXMODE: v_mov_b32_e32 v14,
-; IDXMODE: v_mov_b32_e32 v15,
-; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
+; NOOPT-LABEL: extract_neg_offset_sgpr_loaded:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19
+; NOOPT-NEXT: s_load_dwordx16 s[52:67], s[2:3], 0x29
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x39
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s6, s67
+; NOOPT-NEXT: s_mov_b32 s5, s51
+; NOOPT-NEXT: s_or_b32 s5, s5, s6
+; NOOPT-NEXT: s_mov_b32 s7, s66
+; NOOPT-NEXT: s_mov_b32 s6, s50
+; NOOPT-NEXT: s_or_b32 s6, s6, s7
+; NOOPT-NEXT: s_mov_b32 s8, s65
+; NOOPT-NEXT: s_mov_b32 s7, s49
+; NOOPT-NEXT: s_or_b32 s7, s7, s8
+; NOOPT-NEXT: s_mov_b32 s9, s64
+; NOOPT-NEXT: s_mov_b32 s8, s48
+; NOOPT-NEXT: s_or_b32 s8, s8, s9
+; NOOPT-NEXT: s_mov_b32 s10, s63
+; NOOPT-NEXT: s_mov_b32 s9, s47
+; NOOPT-NEXT: s_or_b32 s9, s9, s10
+; NOOPT-NEXT: s_mov_b32 s11, s62
+; NOOPT-NEXT: s_mov_b32 s10, s46
+; NOOPT-NEXT: s_or_b32 s10, s10, s11
+; NOOPT-NEXT: s_mov_b32 s12, s61
+; NOOPT-NEXT: s_mov_b32 s11, s45
+; NOOPT-NEXT: s_or_b32 s11, s11, s12
+; NOOPT-NEXT: s_mov_b32 s13, s60
+; NOOPT-NEXT: s_mov_b32 s12, s44
+; NOOPT-NEXT: s_or_b32 s12, s12, s13
+; NOOPT-NEXT: s_mov_b32 s14, s59
+; NOOPT-NEXT: s_mov_b32 s13, s43
+; NOOPT-NEXT: s_or_b32 s13, s13, s14
+; NOOPT-NEXT: s_mov_b32 s15, s58
+; NOOPT-NEXT: s_mov_b32 s14, s42
+; NOOPT-NEXT: s_or_b32 s14, s14, s15
+; NOOPT-NEXT: s_mov_b32 s16, s57
+; NOOPT-NEXT: s_mov_b32 s15, s41
+; NOOPT-NEXT: s_or_b32 s15, s15, s16
+; NOOPT-NEXT: s_mov_b32 s17, s56
+; NOOPT-NEXT: s_mov_b32 s16, s40
+; NOOPT-NEXT: s_or_b32 s16, s16, s17
+; NOOPT-NEXT: s_mov_b32 s18, s55
+; NOOPT-NEXT: s_mov_b32 s17, s39
+; NOOPT-NEXT: s_or_b32 s17, s17, s18
+; NOOPT-NEXT: s_mov_b32 s19, s54
+; NOOPT-NEXT: s_mov_b32 s18, s38
+; NOOPT-NEXT: s_or_b32 s18, s18, s19
+; NOOPT-NEXT: s_mov_b32 s20, s53
+; NOOPT-NEXT: s_mov_b32 s19, s37
+; NOOPT-NEXT: s_or_b32 s19, s19, s20
+; NOOPT-NEXT: s_mov_b32 s21, s52
+; NOOPT-NEXT: s_mov_b32 s20, s36
+; NOOPT-NEXT: s_or_b32 s20, s20, s21
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; SI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0x39
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_or_b32 s4, s4, s36
+; SI-MOVREL-NEXT: s_or_b32 s19, s19, s51
+; SI-MOVREL-NEXT: s_or_b32 s18, s18, s50
+; SI-MOVREL-NEXT: s_or_b32 s17, s17, s49
+; SI-MOVREL-NEXT: s_or_b32 s16, s16, s48
+; SI-MOVREL-NEXT: s_or_b32 s15, s15, s47
+; SI-MOVREL-NEXT: s_or_b32 s14, s14, s46
+; SI-MOVREL-NEXT: s_or_b32 s13, s13, s45
+; SI-MOVREL-NEXT: s_or_b32 s12, s12, s44
+; SI-MOVREL-NEXT: s_or_b32 s11, s11, s43
+; SI-MOVREL-NEXT: s_or_b32 s10, s10, s42
+; SI-MOVREL-NEXT: s_or_b32 s9, s9, s41
+; SI-MOVREL-NEXT: s_or_b32 s8, s8, s40
+; SI-MOVREL-NEXT: s_or_b32 s7, s7, s39
+; SI-MOVREL-NEXT: s_or_b32 s6, s6, s38
+; SI-MOVREL-NEXT: s_or_b32 s5, s5, s37
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; SI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_neg_offset_sgpr_loaded:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-MOVREL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0xe4
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_or_b32 s6, s6, s38
+; VI-MOVREL-NEXT: s_or_b32 s5, s5, s37
+; VI-MOVREL-NEXT: s_or_b32 s4, s4, s36
+; VI-MOVREL-NEXT: s_or_b32 s3, s19, s51
+; VI-MOVREL-NEXT: s_or_b32 s18, s18, s50
+; VI-MOVREL-NEXT: s_or_b32 s17, s17, s49
+; VI-MOVREL-NEXT: s_or_b32 s16, s16, s48
+; VI-MOVREL-NEXT: s_or_b32 s15, s15, s47
+; VI-MOVREL-NEXT: s_or_b32 s14, s14, s46
+; VI-MOVREL-NEXT: s_or_b32 s13, s13, s45
+; VI-MOVREL-NEXT: s_or_b32 s12, s12, s44
+; VI-MOVREL-NEXT: s_or_b32 s11, s11, s43
+; VI-MOVREL-NEXT: s_or_b32 s10, s10, s42
+; VI-MOVREL-NEXT: s_or_b32 s9, s9, s41
+; VI-MOVREL-NEXT: s_or_b32 s8, s8, s40
+; VI-MOVREL-NEXT: s_or_b32 s7, s7, s39
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s3
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0xe4
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_or_b32 s6, s6, s38
+; VI-IDXMODE-NEXT: s_or_b32 s5, s5, s37
+; VI-IDXMODE-NEXT: s_or_b32 s4, s4, s36
+; VI-IDXMODE-NEXT: s_or_b32 s3, s19, s51
+; VI-IDXMODE-NEXT: s_or_b32 s18, s18, s50
+; VI-IDXMODE-NEXT: s_or_b32 s17, s17, s49
+; VI-IDXMODE-NEXT: s_or_b32 s16, s16, s48
+; VI-IDXMODE-NEXT: s_or_b32 s15, s15, s47
+; VI-IDXMODE-NEXT: s_or_b32 s14, s14, s46
+; VI-IDXMODE-NEXT: s_or_b32 s13, s13, s45
+; VI-IDXMODE-NEXT: s_or_b32 s12, s12, s44
+; VI-IDXMODE-NEXT: s_or_b32 s11, s11, s43
+; VI-IDXMODE-NEXT: s_or_b32 s10, s10, s42
+; VI-IDXMODE-NEXT: s_or_b32 s9, s9, s41
+; VI-IDXMODE-NEXT: s_or_b32 s8, s8, s40
+; VI-IDXMODE-NEXT: s_or_b32 s7, s7, s39
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s3
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_neg_offset_sgpr_loaded:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0xa4
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xe4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_or_b32 s4, s4, s36
+; GFX9-IDXMODE-NEXT: s_or_b32 s2, s19, s51
+; GFX9-IDXMODE-NEXT: s_or_b32 s3, s18, s50
+; GFX9-IDXMODE-NEXT: s_or_b32 s17, s17, s49
+; GFX9-IDXMODE-NEXT: s_or_b32 s16, s16, s48
+; GFX9-IDXMODE-NEXT: s_or_b32 s15, s15, s47
+; GFX9-IDXMODE-NEXT: s_or_b32 s14, s14, s46
+; GFX9-IDXMODE-NEXT: s_or_b32 s13, s13, s45
+; GFX9-IDXMODE-NEXT: s_or_b32 s12, s12, s44
+; GFX9-IDXMODE-NEXT: s_or_b32 s11, s11, s43
+; GFX9-IDXMODE-NEXT: s_or_b32 s10, s10, s42
+; GFX9-IDXMODE-NEXT: s_or_b32 s9, s9, s41
+; GFX9-IDXMODE-NEXT: s_or_b32 s8, s8, s40
+; GFX9-IDXMODE-NEXT: s_or_b32 s7, s7, s39
+; GFX9-IDXMODE-NEXT: s_or_b32 s6, s6, s38
+; GFX9-IDXMODE-NEXT: s_or_b32 s5, s5, s37
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s3
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s2
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%index = add i32 %offset, -512
%or = or <16 x i32> %vec0, %vec1
@@ -128,25 +1160,350 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
-; The offset depends on the register that holds the first element of the vector.
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-14: v_cndmask_b32
-; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 16
-; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
+; NOOPT-LABEL: extract_neg_offset_vgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s22, -1
+; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s20, s20, s9
+; NOOPT-NEXT: s_addc_u32 s21, s21, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
+; NOOPT-NEXT: v_mov_b32_e32 v1, v0
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: s_waitcnt vmcnt(1)
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 1
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 2
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 3
+; NOOPT-NEXT: s_mov_b32 s0, 16
+; NOOPT-NEXT: s_mov_b32 s1, 15
+; NOOPT-NEXT: s_mov_b32 s2, 14
+; NOOPT-NEXT: s_mov_b32 s3, 13
+; NOOPT-NEXT: s_mov_b32 s4, 12
+; NOOPT-NEXT: s_mov_b32 s5, 11
+; NOOPT-NEXT: s_mov_b32 s6, 10
+; NOOPT-NEXT: s_mov_b32 s7, 9
+; NOOPT-NEXT: s_mov_b32 s8, 8
+; NOOPT-NEXT: s_mov_b32 s9, 7
+; NOOPT-NEXT: s_mov_b32 s10, 6
+; NOOPT-NEXT: s_mov_b32 s11, 5
+; NOOPT-NEXT: s_mov_b32 s12, 3
+; NOOPT-NEXT: s_mov_b32 s13, 2
+; NOOPT-NEXT: s_mov_b32 s14, 1
+; NOOPT-NEXT: s_mov_b32 s15, 0
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v1, s15
+; NOOPT-NEXT: v_mov_b32_e32 v31, s14
+; NOOPT-NEXT: v_mov_b32_e32 v30, s13
+; NOOPT-NEXT: v_mov_b32_e32 v29, s12
+; NOOPT-NEXT: v_mov_b32_e32 v28, s11
+; NOOPT-NEXT: v_mov_b32_e32 v27, s10
+; NOOPT-NEXT: v_mov_b32_e32 v26, s9
+; NOOPT-NEXT: v_mov_b32_e32 v25, s8
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7
+; NOOPT-NEXT: v_mov_b32_e32 v23, s6
+; NOOPT-NEXT: v_mov_b32_e32 v22, s5
+; NOOPT-NEXT: v_mov_b32_e32 v21, s4
+; NOOPT-NEXT: v_mov_b32_e32 v20, s3
+; NOOPT-NEXT: v_mov_b32_e32 v19, s2
+; NOOPT-NEXT: v_mov_b32_e32 v18, s1
+; NOOPT-NEXT: v_mov_b32_e32 v17, s0
+; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v2, v31
+; NOOPT-NEXT: v_mov_b32_e32 v3, v30
+; NOOPT-NEXT: v_mov_b32_e32 v4, v29
+; NOOPT-NEXT: v_mov_b32_e32 v5, v28
+; NOOPT-NEXT: v_mov_b32_e32 v6, v27
+; NOOPT-NEXT: v_mov_b32_e32 v7, v26
+; NOOPT-NEXT: v_mov_b32_e32 v8, v25
+; NOOPT-NEXT: v_mov_b32_e32 v9, v24
+; NOOPT-NEXT: v_mov_b32_e32 v10, v23
+; NOOPT-NEXT: v_mov_b32_e32 v11, v22
+; NOOPT-NEXT: v_mov_b32_e32 v12, v21
+; NOOPT-NEXT: v_mov_b32_e32 v13, v20
+; NOOPT-NEXT: v_mov_b32_e32 v14, v19
+; NOOPT-NEXT: v_mov_b32_e32 v15, v18
+; NOOPT-NEXT: v_mov_b32_e32 v16, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 4
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 5
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: ; implicit-def: $vgpr0
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 6
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 7
+; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00
+; NOOPT-NEXT: v_movrels_b32_e32 v1, v1
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 6
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 7
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB5_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 5
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 1
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 2
+; NOOPT-NEXT: v_readlane_b32 s3, v0, 3
+; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_neg_offset_vgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: extract_neg_offset_vgpr:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfffffe00, v0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0
+; VI-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, 16, v1, vcc
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_neg_offset_vgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 0xfffffe00, v0
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 6, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 7, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 8, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 9, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 10, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 11, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 12, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 13, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 14, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 15, v2, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 16, v2, vcc
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%index = add i32 %id, -512
%value = extractelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 %index
store i32 %value, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; NOOPT-LABEL: extract_undef_offset_sgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_undef_offset_sgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 s4, s2
+; SI-MOVREL-NEXT: s_mov_b32 s5, s3
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: extract_undef_offset_sgpr:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_undef_offset_sgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ld = load volatile <4 x i32>, ptr addrspace(1) %in
%value = extractelement <4 x i32> %ld, i32 undef
@@ -154,9 +1511,23 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_undef_offset_sgpr_vector_src:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: insert_undef_offset_sgpr_vector_src:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_undef_offset_sgpr_vector_src:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ld = load <4 x i32>, ptr addrspace(1) %in
%value = insertelement <4 x i32> %ld, i32 5, i32 undef
@@ -164,20 +1535,276 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insert_w_offset:
-; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
-; MOVREL-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
-; MOVREL-DAG: s_mov_b32 m0, [[IN]]
-; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
-; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
-; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
-
-; MOVREL: v_movreld_b32_e32 v[[ELT0]], v[[INS]]
-; MOVREL: buffer_store_dwordx4 v[[[ELT0]]:[[ELT3]]]
define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
+; NOOPT-LABEL: insert_w_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 1
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v7, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v6, s11
+; NOOPT-NEXT: v_mov_b32_e32 v5, s10
+; NOOPT-NEXT: v_mov_b32_e32 v4, s9
+; NOOPT-NEXT: v_mov_b32_e32 v3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v2, s7
+; NOOPT-NEXT: v_mov_b32_e32 v1, s6
+; NOOPT-NEXT: v_mov_b32_e32 v0, s5
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v8, v30
+; NOOPT-NEXT: v_mov_b32_e32 v9, v29
+; NOOPT-NEXT: v_mov_b32_e32 v10, v28
+; NOOPT-NEXT: v_mov_b32_e32 v11, v27
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v15, v23
+; NOOPT-NEXT: v_mov_b32_e32 v16, v6
+; NOOPT-NEXT: v_mov_b32_e32 v17, v5
+; NOOPT-NEXT: v_mov_b32_e32 v18, v4
+; NOOPT-NEXT: v_mov_b32_e32 v19, v3
+; NOOPT-NEXT: v_mov_b32_e32 v20, v2
+; NOOPT-NEXT: v_mov_b32_e32 v21, v1
+; NOOPT-NEXT: v_mov_b32_e32 v22, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movreld_b32_e32 v7, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_w_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_w_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-MOVREL-NEXT: s_mov_b32 m0, s4
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_w_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_w_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%add = add i32 %in, 1
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
@@ -185,19 +1812,276 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insert_unsigned_base_plus_offset:
-; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
-; GCN-DAG: s_and_b32 [[BASE:s[0-9]+]], [[IN]], 0xffff
-
-; MOVREL: s_mov_b32 m0, [[BASE]]
-; MOVREL: v_movreld_b32_e32 [[ELT1]], v{{[0-9]+}}
-
-; IDXMODE: s_set_gpr_idx_on [[BASE]], gpr_idx(DST)
-; IDXMODE-NEXT: v_mov_b32_e32 [[ELT1]], v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
+; NOOPT-LABEL: insert_unsigned_base_plus_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0xffff
+; NOOPT-NEXT: s_and_b32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v7, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v6, s11
+; NOOPT-NEXT: v_mov_b32_e32 v5, s10
+; NOOPT-NEXT: v_mov_b32_e32 v4, s9
+; NOOPT-NEXT: v_mov_b32_e32 v3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v2, s7
+; NOOPT-NEXT: v_mov_b32_e32 v1, s6
+; NOOPT-NEXT: v_mov_b32_e32 v0, s5
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v8, v30
+; NOOPT-NEXT: v_mov_b32_e32 v9, v29
+; NOOPT-NEXT: v_mov_b32_e32 v10, v28
+; NOOPT-NEXT: v_mov_b32_e32 v11, v27
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v15, v23
+; NOOPT-NEXT: v_mov_b32_e32 v16, v6
+; NOOPT-NEXT: v_mov_b32_e32 v17, v5
+; NOOPT-NEXT: v_mov_b32_e32 v18, v4
+; NOOPT-NEXT: v_mov_b32_e32 v19, v3
+; NOOPT-NEXT: v_mov_b32_e32 v20, v2
+; NOOPT-NEXT: v_mov_b32_e32 v21, v1
+; NOOPT-NEXT: v_mov_b32_e32 v22, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movreld_b32_e32 v8, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_unsigned_base_plus_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_unsigned_base_plus_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_and_b32 s2, s4, 0xffff
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_unsigned_base_plus_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_unsigned_base_plus_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%base = zext i16 %in to i32
%add = add i32 %base, 1
@@ -206,21 +2090,281 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insert_signed_base_plus_offset:
-; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 2.0
-
-; GCN-DAG: s_sext_i32_i16 [[BASE:s[0-9]+]], [[IN]]
-; GCN-DAG: s_add_i32 [[BASE_PLUS_OFFSET:s[0-9]+]], [[BASE]], 1
-
-; MOVREL: s_mov_b32 m0, [[BASE_PLUS_OFFSET]]
-; MOVREL: v_movreld_b32_e32 [[ELT0]], v{{[0-9]+}}
-
-; IDXMODE: s_set_gpr_idx_on [[BASE_PLUS_OFFSET]], gpr_idx(DST)
-; IDXMODE-NEXT: v_mov_b32_e32 [[ELT0]], v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
+; NOOPT-LABEL: insert_signed_base_plus_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_sext_i32_i16 s4, s4
+; NOOPT-NEXT: s_mov_b32 s5, 1
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v7, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v6, s11
+; NOOPT-NEXT: v_mov_b32_e32 v5, s10
+; NOOPT-NEXT: v_mov_b32_e32 v4, s9
+; NOOPT-NEXT: v_mov_b32_e32 v3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v2, s7
+; NOOPT-NEXT: v_mov_b32_e32 v1, s6
+; NOOPT-NEXT: v_mov_b32_e32 v0, s5
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v8, v30
+; NOOPT-NEXT: v_mov_b32_e32 v9, v29
+; NOOPT-NEXT: v_mov_b32_e32 v10, v28
+; NOOPT-NEXT: v_mov_b32_e32 v11, v27
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v15, v23
+; NOOPT-NEXT: v_mov_b32_e32 v16, v6
+; NOOPT-NEXT: v_mov_b32_e32 v17, v5
+; NOOPT-NEXT: v_mov_b32_e32 v18, v4
+; NOOPT-NEXT: v_mov_b32_e32 v19, v3
+; NOOPT-NEXT: v_mov_b32_e32 v20, v2
+; NOOPT-NEXT: v_mov_b32_e32 v21, v1
+; NOOPT-NEXT: v_mov_b32_e32 v22, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movreld_b32_e32 v7, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_signed_base_plus_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_sext_i32_i16 s4, s4
+; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_signed_base_plus_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_sext_i32_i16 s2, s4
+; VI-MOVREL-NEXT: s_add_i32 s2, s2, 1
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_signed_base_plus_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_sext_i32_i16 s2, s4
+; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_signed_base_plus_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_sext_i32_i16 s2, s4
+; GFX9-IDXMODE-NEXT: s_add_i32 s2, s2, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%base = sext i16 %in to i32
%add = add i32 %base, 1
@@ -229,35 +2373,553 @@ entry:
ret void
}
-
-; GCN-LABEL: {{^}}insert_wo_offset:
-; GCN: s_load_dword [[IN:s[0-9]+]]
-
-; MOVREL: s_mov_b32 m0, [[IN]]
-; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
-
-; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(DST)
-; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
-
-; GCN: buffer_store_dwordx4 v[[[ELT0]]:
+; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
+; of the tied implicit use and def of the super register.
define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
+; NOOPT-LABEL: insert_wo_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v7, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v6, s11
+; NOOPT-NEXT: v_mov_b32_e32 v5, s10
+; NOOPT-NEXT: v_mov_b32_e32 v4, s9
+; NOOPT-NEXT: v_mov_b32_e32 v3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v2, s7
+; NOOPT-NEXT: v_mov_b32_e32 v1, s6
+; NOOPT-NEXT: v_mov_b32_e32 v0, s5
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v8, v30
+; NOOPT-NEXT: v_mov_b32_e32 v9, v29
+; NOOPT-NEXT: v_mov_b32_e32 v10, v28
+; NOOPT-NEXT: v_mov_b32_e32 v11, v27
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v15, v23
+; NOOPT-NEXT: v_mov_b32_e32 v16, v6
+; NOOPT-NEXT: v_mov_b32_e32 v17, v5
+; NOOPT-NEXT: v_mov_b32_e32 v18, v4
+; NOOPT-NEXT: v_mov_b32_e32 v19, v3
+; NOOPT-NEXT: v_mov_b32_e32 v20, v2
+; NOOPT-NEXT: v_mov_b32_e32 v21, v1
+; NOOPT-NEXT: v_mov_b32_e32 v22, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movreld_b32_e32 v7, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_wo_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_wo_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_wo_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x41880000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_wo_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x41880000
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
store <16 x float> %ins, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
-; The offset depends on the register that holds the first element of the vector.
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movreld_b32_e32 v0, 16
-
-; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
-; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
+; NOOPT-LABEL: insert_neg_offset_sgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 15
+; NOOPT-NEXT: s_mov_b32 s6, 14
+; NOOPT-NEXT: s_mov_b32 s7, 13
+; NOOPT-NEXT: s_mov_b32 s8, 12
+; NOOPT-NEXT: s_mov_b32 s9, 11
+; NOOPT-NEXT: s_mov_b32 s10, 10
+; NOOPT-NEXT: s_mov_b32 s11, 9
+; NOOPT-NEXT: s_mov_b32 s12, 8
+; NOOPT-NEXT: s_mov_b32 s13, 7
+; NOOPT-NEXT: s_mov_b32 s14, 6
+; NOOPT-NEXT: s_mov_b32 s15, 5
+; NOOPT-NEXT: s_mov_b32 s16, 4
+; NOOPT-NEXT: s_mov_b32 s17, 3
+; NOOPT-NEXT: s_mov_b32 s18, 2
+; NOOPT-NEXT: s_mov_b32 s19, 1
+; NOOPT-NEXT: s_mov_b32 s20, 0
+; NOOPT-NEXT: v_mov_b32_e32 v15, s20
+; NOOPT-NEXT: v_mov_b32_e32 v14, s19
+; NOOPT-NEXT: v_mov_b32_e32 v13, s18
+; NOOPT-NEXT: v_mov_b32_e32 v12, s17
+; NOOPT-NEXT: v_mov_b32_e32 v11, s16
+; NOOPT-NEXT: v_mov_b32_e32 v10, s15
+; NOOPT-NEXT: v_mov_b32_e32 v9, s14
+; NOOPT-NEXT: v_mov_b32_e32 v8, s13
+; NOOPT-NEXT: v_mov_b32_e32 v7, s12
+; NOOPT-NEXT: v_mov_b32_e32 v6, s11
+; NOOPT-NEXT: v_mov_b32_e32 v5, s10
+; NOOPT-NEXT: v_mov_b32_e32 v4, s9
+; NOOPT-NEXT: v_mov_b32_e32 v3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v2, s7
+; NOOPT-NEXT: v_mov_b32_e32 v1, s6
+; NOOPT-NEXT: v_mov_b32_e32 v0, s5
+; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v16, v14
+; NOOPT-NEXT: v_mov_b32_e32 v17, v13
+; NOOPT-NEXT: v_mov_b32_e32 v18, v12
+; NOOPT-NEXT: v_mov_b32_e32 v19, v11
+; NOOPT-NEXT: v_mov_b32_e32 v20, v10
+; NOOPT-NEXT: v_mov_b32_e32 v21, v9
+; NOOPT-NEXT: v_mov_b32_e32 v22, v8
+; NOOPT-NEXT: v_mov_b32_e32 v23, v7
+; NOOPT-NEXT: v_mov_b32_e32 v24, v6
+; NOOPT-NEXT: v_mov_b32_e32 v25, v5
+; NOOPT-NEXT: v_mov_b32_e32 v26, v4
+; NOOPT-NEXT: v_mov_b32_e32 v27, v3
+; NOOPT-NEXT: v_mov_b32_e32 v28, v2
+; NOOPT-NEXT: v_mov_b32_e32 v29, v1
+; NOOPT-NEXT: v_mov_b32_e32 v30, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 16
+; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; NOOPT-NEXT: v_movreld_b32_e32 v15, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: v_mov_b32_e32 v1, v22
+; NOOPT-NEXT: v_mov_b32_e32 v2, v21
+; NOOPT-NEXT: v_mov_b32_e32 v3, v20
+; NOOPT-NEXT: v_mov_b32_e32 v7, v19
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v30
+; NOOPT-NEXT: v_mov_b32_e32 v10, v29
+; NOOPT-NEXT: v_mov_b32_e32 v11, v28
+; NOOPT-NEXT: v_mov_b32_e32 v15, v27
+; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v16, v11
+; NOOPT-NEXT: v_mov_b32_e32 v17, v10
+; NOOPT-NEXT: v_mov_b32_e32 v18, v9
+; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v9, v14
+; NOOPT-NEXT: v_mov_b32_e32 v10, v13
+; NOOPT-NEXT: v_mov_b32_e32 v11, v12
+; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v8, v3
+; NOOPT-NEXT: v_mov_b32_e32 v9, v2
+; NOOPT-NEXT: v_mov_b32_e32 v10, v1
+; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_neg_offset_sgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xd
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 3
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 15
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_neg_offset_sgpr:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 3
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 m0, s2, 0xfffffe00
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_neg_offset_sgpr:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 15
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_addk_i32 s2, 0xfe00
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v15
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_addk_i32 s4, 0xfe00
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 16
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[13:16], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[9:12], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%index = add i32 %offset, -512
%value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
@@ -267,17 +2929,239 @@ entry:
; The vector indexed into is originally loaded into an SGPR rather
; than built with a reg_sequence
-
-; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
-; The offset depends on the register that holds the first element of the vector.
-; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movreld_b32_e32 v0, 5
-
-; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
-; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(DST)
-; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
+; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: v_mov_b32_e32 v0, 5
+; NOOPT-NEXT: v_mov_b32_e32 v30, s23
+; NOOPT-NEXT: v_mov_b32_e32 v29, s22
+; NOOPT-NEXT: v_mov_b32_e32 v28, s21
+; NOOPT-NEXT: v_mov_b32_e32 v27, s20
+; NOOPT-NEXT: v_mov_b32_e32 v26, s19
+; NOOPT-NEXT: v_mov_b32_e32 v25, s18
+; NOOPT-NEXT: v_mov_b32_e32 v24, s17
+; NOOPT-NEXT: v_mov_b32_e32 v23, s16
+; NOOPT-NEXT: v_mov_b32_e32 v22, s15
+; NOOPT-NEXT: v_mov_b32_e32 v21, s14
+; NOOPT-NEXT: v_mov_b32_e32 v20, s13
+; NOOPT-NEXT: v_mov_b32_e32 v19, s12
+; NOOPT-NEXT: v_mov_b32_e32 v18, s11
+; NOOPT-NEXT: v_mov_b32_e32 v17, s10
+; NOOPT-NEXT: v_mov_b32_e32 v16, s9
+; NOOPT-NEXT: v_mov_b32_e32 v15, s8
+; NOOPT-NEXT: s_add_i32 m0, s4, 0xfffffe00
+; NOOPT-NEXT: v_movreld_b32_e32 v15, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: v_mov_b32_e32 v1, v22
+; NOOPT-NEXT: v_mov_b32_e32 v2, v21
+; NOOPT-NEXT: v_mov_b32_e32 v3, v20
+; NOOPT-NEXT: v_mov_b32_e32 v7, v19
+; NOOPT-NEXT: v_mov_b32_e32 v12, v26
+; NOOPT-NEXT: v_mov_b32_e32 v13, v25
+; NOOPT-NEXT: v_mov_b32_e32 v14, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v30
+; NOOPT-NEXT: v_mov_b32_e32 v10, v29
+; NOOPT-NEXT: v_mov_b32_e32 v11, v28
+; NOOPT-NEXT: v_mov_b32_e32 v15, v27
+; NOOPT-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16_vgpr17_vgpr18 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v16, v11
+; NOOPT-NEXT: v_mov_b32_e32 v17, v10
+; NOOPT-NEXT: v_mov_b32_e32 v18, v9
+; NOOPT-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v9, v14
+; NOOPT-NEXT: v_mov_b32_e32 v10, v13
+; NOOPT-NEXT: v_mov_b32_e32 v11, v12
+; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v8, v3
+; NOOPT-NEXT: v_mov_b32_e32 v9, v2
+; NOOPT-NEXT: v_mov_b32_e32 v10, v1
+; NOOPT-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29
+; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0xb
+; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s22, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; SI-MOVREL-NEXT: s_add_i32 m0, s0, 0xfffffe00
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_neg_offset_sgpr_loadreg:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: s_add_i32 m0, s20, 0xfffffe00
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 5
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_neg_offset_sgpr_loadreg:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19
+; GFX9-IDXMODE-NEXT: s_addk_i32 s20, 0xfe00
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s20, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%index = add i32 %offset, -512
%value = insertelement <16 x i32> %vec, i32 5, i32 %index
@@ -285,29 +3169,885 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
-; The offset depends on the register that holds the first element of the vector.
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-16: v_cndmask_b32
-; GCN-COUNT-4: buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; NOOPT-LABEL: insert_neg_offset_vgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s22, -1
+; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s20, s20, s9
+; NOOPT-NEXT: s_addc_u32 s21, s21, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 1
+; NOOPT-NEXT: v_writelane_b32 v16, s2, 2
+; NOOPT-NEXT: v_writelane_b32 v16, s3, 3
+; NOOPT-NEXT: s_mov_b32 s0, 16
+; NOOPT-NEXT: s_mov_b32 s1, 15
+; NOOPT-NEXT: s_mov_b32 s2, 14
+; NOOPT-NEXT: s_mov_b32 s3, 13
+; NOOPT-NEXT: s_mov_b32 s4, 12
+; NOOPT-NEXT: s_mov_b32 s5, 11
+; NOOPT-NEXT: s_mov_b32 s6, 10
+; NOOPT-NEXT: s_mov_b32 s7, 9
+; NOOPT-NEXT: s_mov_b32 s8, 8
+; NOOPT-NEXT: s_mov_b32 s9, 7
+; NOOPT-NEXT: s_mov_b32 s10, 6
+; NOOPT-NEXT: s_mov_b32 s11, 5
+; NOOPT-NEXT: s_mov_b32 s12, 4
+; NOOPT-NEXT: s_mov_b32 s13, 3
+; NOOPT-NEXT: s_mov_b32 s14, 2
+; NOOPT-NEXT: s_mov_b32 s15, 1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s15
+; NOOPT-NEXT: v_mov_b32_e32 v31, s14
+; NOOPT-NEXT: v_mov_b32_e32 v30, s13
+; NOOPT-NEXT: v_mov_b32_e32 v29, s12
+; NOOPT-NEXT: v_mov_b32_e32 v28, s11
+; NOOPT-NEXT: v_mov_b32_e32 v27, s10
+; NOOPT-NEXT: v_mov_b32_e32 v26, s9
+; NOOPT-NEXT: v_mov_b32_e32 v25, s8
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7
+; NOOPT-NEXT: v_mov_b32_e32 v23, s6
+; NOOPT-NEXT: v_mov_b32_e32 v22, s5
+; NOOPT-NEXT: v_mov_b32_e32 v21, s4
+; NOOPT-NEXT: v_mov_b32_e32 v20, s3
+; NOOPT-NEXT: v_mov_b32_e32 v19, s2
+; NOOPT-NEXT: v_mov_b32_e32 v18, s1
+; NOOPT-NEXT: v_mov_b32_e32 v17, s0
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v31
+; NOOPT-NEXT: v_mov_b32_e32 v2, v30
+; NOOPT-NEXT: v_mov_b32_e32 v3, v29
+; NOOPT-NEXT: v_mov_b32_e32 v4, v28
+; NOOPT-NEXT: v_mov_b32_e32 v5, v27
+; NOOPT-NEXT: v_mov_b32_e32 v6, v26
+; NOOPT-NEXT: v_mov_b32_e32 v7, v25
+; NOOPT-NEXT: v_mov_b32_e32 v8, v24
+; NOOPT-NEXT: v_mov_b32_e32 v9, v23
+; NOOPT-NEXT: v_mov_b32_e32 v10, v22
+; NOOPT-NEXT: v_mov_b32_e32 v11, v21
+; NOOPT-NEXT: v_mov_b32_e32 v12, v20
+; NOOPT-NEXT: v_mov_b32_e32 v13, v19
+; NOOPT-NEXT: v_mov_b32_e32 v14, v18
+; NOOPT-NEXT: v_mov_b32_e32 v15, v17
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_mov_b32_e32 v17, 33
+; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 4
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 5
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt expcnt(2)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 6
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 7
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_add_i32 m0, s2, 0xfffffe00
+; NOOPT-NEXT: v_movreld_b32_e32 v1, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 6
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 7
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB14_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 5
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 1
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 2
+; NOOPT-NEXT: v_readlane_b32 s3, v0, 3
+; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v5, v19
+; NOOPT-NEXT: v_mov_b32_e32 v6, v18
+; NOOPT-NEXT: v_mov_b32_e32 v7, v17
+; NOOPT-NEXT: v_mov_b32_e32 v1, v16
+; NOOPT-NEXT: v_mov_b32_e32 v2, v23
+; NOOPT-NEXT: v_mov_b32_e32 v3, v22
+; NOOPT-NEXT: v_mov_b32_e32 v4, v21
+; NOOPT-NEXT: v_mov_b32_e32 v8, v20
+; NOOPT-NEXT: v_mov_b32_e32 v13, v27
+; NOOPT-NEXT: v_mov_b32_e32 v14, v26
+; NOOPT-NEXT: v_mov_b32_e32 v15, v25
+; NOOPT-NEXT: v_mov_b32_e32 v9, v24
+; NOOPT-NEXT: v_mov_b32_e32 v10, v31
+; NOOPT-NEXT: v_mov_b32_e32 v11, v30
+; NOOPT-NEXT: v_mov_b32_e32 v12, v29
+; NOOPT-NEXT: v_mov_b32_e32 v16, v28
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v17, v12
+; NOOPT-NEXT: v_mov_b32_e32 v18, v11
+; NOOPT-NEXT: v_mov_b32_e32 v19, v10
+; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v10, v15
+; NOOPT-NEXT: v_mov_b32_e32 v11, v14
+; NOOPT-NEXT: v_mov_b32_e32 v12, v13
+; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v9, v4
+; NOOPT-NEXT: v_mov_b32_e32 v10, v3
+; NOOPT-NEXT: v_mov_b32_e32 v11, v2
+; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v2, v7
+; NOOPT-NEXT: v_mov_b32_e32 v3, v6
+; NOOPT-NEXT: v_mov_b32_e32 v4, v5
+; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_neg_offset_vgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: insert_neg_offset_vgpr:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_add_u32_e32 v12, vcc, 0xfffffe00, v0
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; VI-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; VI-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; VI-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; VI-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; VI-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; VI-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; VI-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; VI-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; VI-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; VI-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; VI-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s0, 48
+; VI-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; VI-NEXT: v_mov_b32_e32 v17, s3
+; VI-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; VI-NEXT: v_mov_b32_e32 v16, s2
+; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v13, s3
+; VI-NEXT: v_mov_b32_e32 v12, s2
+; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_neg_offset_vgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, 0xfffffe00, v0
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%index = add i32 %id, -512
%value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
store <16 x i32> %value, ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-16: v_cndmask_b32
-; GCN-COUNT-4: buffer_store_dwordx4
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; NOOPT-LABEL: insert_neg_inline_offset_vgpr:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s22, -1
+; NOOPT-NEXT: s_mov_b32 s23, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s20, s20, s9
+; NOOPT-NEXT: s_addc_u32 s21, s21, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 1
+; NOOPT-NEXT: v_writelane_b32 v16, s2, 2
+; NOOPT-NEXT: v_writelane_b32 v16, s3, 3
+; NOOPT-NEXT: s_mov_b32 s0, 16
+; NOOPT-NEXT: s_mov_b32 s1, 15
+; NOOPT-NEXT: s_mov_b32 s2, 14
+; NOOPT-NEXT: s_mov_b32 s3, 13
+; NOOPT-NEXT: s_mov_b32 s4, 12
+; NOOPT-NEXT: s_mov_b32 s5, 11
+; NOOPT-NEXT: s_mov_b32 s6, 10
+; NOOPT-NEXT: s_mov_b32 s7, 9
+; NOOPT-NEXT: s_mov_b32 s8, 8
+; NOOPT-NEXT: s_mov_b32 s9, 7
+; NOOPT-NEXT: s_mov_b32 s10, 6
+; NOOPT-NEXT: s_mov_b32 s11, 5
+; NOOPT-NEXT: s_mov_b32 s12, 4
+; NOOPT-NEXT: s_mov_b32 s13, 3
+; NOOPT-NEXT: s_mov_b32 s14, 2
+; NOOPT-NEXT: s_mov_b32 s15, 1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s15
+; NOOPT-NEXT: v_mov_b32_e32 v31, s14
+; NOOPT-NEXT: v_mov_b32_e32 v30, s13
+; NOOPT-NEXT: v_mov_b32_e32 v29, s12
+; NOOPT-NEXT: v_mov_b32_e32 v28, s11
+; NOOPT-NEXT: v_mov_b32_e32 v27, s10
+; NOOPT-NEXT: v_mov_b32_e32 v26, s9
+; NOOPT-NEXT: v_mov_b32_e32 v25, s8
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7
+; NOOPT-NEXT: v_mov_b32_e32 v23, s6
+; NOOPT-NEXT: v_mov_b32_e32 v22, s5
+; NOOPT-NEXT: v_mov_b32_e32 v21, s4
+; NOOPT-NEXT: v_mov_b32_e32 v20, s3
+; NOOPT-NEXT: v_mov_b32_e32 v19, s2
+; NOOPT-NEXT: v_mov_b32_e32 v18, s1
+; NOOPT-NEXT: v_mov_b32_e32 v17, s0
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v31
+; NOOPT-NEXT: v_mov_b32_e32 v2, v30
+; NOOPT-NEXT: v_mov_b32_e32 v3, v29
+; NOOPT-NEXT: v_mov_b32_e32 v4, v28
+; NOOPT-NEXT: v_mov_b32_e32 v5, v27
+; NOOPT-NEXT: v_mov_b32_e32 v6, v26
+; NOOPT-NEXT: v_mov_b32_e32 v7, v25
+; NOOPT-NEXT: v_mov_b32_e32 v8, v24
+; NOOPT-NEXT: v_mov_b32_e32 v9, v23
+; NOOPT-NEXT: v_mov_b32_e32 v10, v22
+; NOOPT-NEXT: v_mov_b32_e32 v11, v21
+; NOOPT-NEXT: v_mov_b32_e32 v12, v20
+; NOOPT-NEXT: v_mov_b32_e32 v13, v19
+; NOOPT-NEXT: v_mov_b32_e32 v14, v18
+; NOOPT-NEXT: v_mov_b32_e32 v15, v17
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_mov_b32_e32 v17, 0x1f4
+; NOOPT-NEXT: buffer_store_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 4
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 5
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt expcnt(2)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 6
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 7
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[20:23], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_add_i32 m0, s2, -16
+; NOOPT-NEXT: v_movreld_b32_e32 v1, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:148 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:160 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:164 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:168 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:172 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:176 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:180 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:184 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:188 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:192 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:196 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:200 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[20:23], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[20:23], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[20:23], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[20:23], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[20:23], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[20:23], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[20:23], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[20:23], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[20:23], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[20:23], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[20:23], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[20:23], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[20:23], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 6
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 7
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB15_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 5
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: s_or_saveexec_b64 s[16:17], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[16:17]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 1
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 2
+; NOOPT-NEXT: v_readlane_b32 s3, v0, 3
+; NOOPT-NEXT: buffer_load_dword v16, off, s[20:23], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[20:23], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[20:23], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v19, off, s[20:23], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v20, off, s[20:23], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v21, off, s[20:23], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v22, off, s[20:23], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v23, off, s[20:23], 0 offset:168 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v24, off, s[20:23], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v25, off, s[20:23], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v26, off, s[20:23], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v27, off, s[20:23], 0 offset:184 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v28, off, s[20:23], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v29, off, s[20:23], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v30, off, s[20:23], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v31, off, s[20:23], 0 offset:200 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v5, v19
+; NOOPT-NEXT: v_mov_b32_e32 v6, v18
+; NOOPT-NEXT: v_mov_b32_e32 v7, v17
+; NOOPT-NEXT: v_mov_b32_e32 v1, v16
+; NOOPT-NEXT: v_mov_b32_e32 v2, v23
+; NOOPT-NEXT: v_mov_b32_e32 v3, v22
+; NOOPT-NEXT: v_mov_b32_e32 v4, v21
+; NOOPT-NEXT: v_mov_b32_e32 v8, v20
+; NOOPT-NEXT: v_mov_b32_e32 v13, v27
+; NOOPT-NEXT: v_mov_b32_e32 v14, v26
+; NOOPT-NEXT: v_mov_b32_e32 v15, v25
+; NOOPT-NEXT: v_mov_b32_e32 v9, v24
+; NOOPT-NEXT: v_mov_b32_e32 v10, v31
+; NOOPT-NEXT: v_mov_b32_e32 v11, v30
+; NOOPT-NEXT: v_mov_b32_e32 v12, v29
+; NOOPT-NEXT: v_mov_b32_e32 v16, v28
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17_vgpr18_vgpr19 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v17, v12
+; NOOPT-NEXT: v_mov_b32_e32 v18, v11
+; NOOPT-NEXT: v_mov_b32_e32 v19, v10
+; NOOPT-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v10, v15
+; NOOPT-NEXT: v_mov_b32_e32 v11, v14
+; NOOPT-NEXT: v_mov_b32_e32 v12, v13
+; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v9, v4
+; NOOPT-NEXT: v_mov_b32_e32 v10, v3
+; NOOPT-NEXT: v_mov_b32_e32 v11, v2
+; NOOPT-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v2, v7
+; NOOPT-NEXT: v_mov_b32_e32 v3, v6
+; NOOPT-NEXT: v_mov_b32_e32 v4, v5
+; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_neg_inline_offset_vgpr:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: v_add_i32_e32 v12, vcc, -16, v0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x1f4
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: insert_neg_inline_offset_vgpr:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_add_u32_e32 v12, vcc, -16, v0
+; VI-NEXT: v_mov_b32_e32 v16, 0x1f4
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; VI-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; VI-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; VI-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; VI-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; VI-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; VI-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; VI-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; VI-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; VI-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; VI-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; VI-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; VI-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; VI-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; VI-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s0, 48
+; VI-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc
+; VI-NEXT: v_mov_b32_e32 v17, s3
+; VI-NEXT: v_mov_b32_e32 v16, s2
+; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v13, s3
+; VI-NEXT: v_mov_b32_e32 v12, s2
+; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-NEXT: s_nop 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_neg_inline_offset_vgpr:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v12, -16, v0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x1f4
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 4, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v2, 3, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v1, 2, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v0, 1, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 8, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 7, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 6, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 5, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 12, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 11, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 10, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 9, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 16, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 15, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 14, v17, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 13, v17, vcc
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%index = add i32 %id, -16
%value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
store <16 x i32> %value, ptr addrspace(1) %out
@@ -316,19 +4056,646 @@ entry:
; When the block is split to insert the loop, make sure any other
; places that need to be expanded in the same block are also handled.
-
-; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
-
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN: v_cmp_eq_u32
-; GCN: v_cndmask_b32_e64 [[RESULT0:v[0-9]+]], 16,
-; GCN: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 16,
-
-; GCN: buffer_store_dword [[RESULT0]]
-; GCN: buffer_store_dword [[RESULT1]]
-define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #0 {
+define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) {
+; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s38, -1
+; NOOPT-NEXT: s_mov_b32 s39, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s36, s36, s9
+; NOOPT-NEXT: s_addc_u32 s37, s37, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
+; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3]
+; NOOPT-NEXT: v_mov_b32_e32 v1, v0
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s8, s3
+; NOOPT-NEXT: s_mov_b32 s4, s2
+; NOOPT-NEXT: s_mov_b32 s2, 0xf000
+; NOOPT-NEXT: s_mov_b32 s3, -1
+; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: s_mov_b32 s5, s8
+; NOOPT-NEXT: s_mov_b32 s6, s3
+; NOOPT-NEXT: s_mov_b32 s7, s2
+; NOOPT-NEXT: s_waitcnt vmcnt(1)
+; NOOPT-NEXT: v_writelane_b32 v0, s4, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s5, 1
+; NOOPT-NEXT: v_writelane_b32 v0, s6, 2
+; NOOPT-NEXT: v_writelane_b32 v0, s7, 3
+; NOOPT-NEXT: s_mov_b32 s4, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s4, 4
+; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; NOOPT-NEXT: s_mov_b32 s5, s2
+; NOOPT-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
+; NOOPT-NEXT: s_mov_b32 s4, 2
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_lshlrev_b32_e64 v1, s4, v1
+; NOOPT-NEXT: s_mov_b32 s4, 0
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: v_mov_b32_e32 v3, 0
+; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v2, v3
+; NOOPT-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b32 s0, 1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_add_i32_e64 v1, s[0:1], v1, s0
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b32 s16, 16
+; NOOPT-NEXT: s_mov_b32 s17, 15
+; NOOPT-NEXT: s_mov_b32 s18, 14
+; NOOPT-NEXT: s_mov_b32 s20, 12
+; NOOPT-NEXT: s_mov_b32 s22, 10
+; NOOPT-NEXT: s_mov_b32 s24, 8
+; NOOPT-NEXT: s_mov_b32 s26, 6
+; NOOPT-NEXT: s_mov_b32 s27, 5
+; NOOPT-NEXT: s_mov_b32 s19, 13
+; NOOPT-NEXT: s_mov_b32 s21, 11
+; NOOPT-NEXT: s_mov_b32 s23, 9
+; NOOPT-NEXT: s_mov_b32 s25, 7
+; NOOPT-NEXT: s_mov_b32 s0, s25
+; NOOPT-NEXT: s_mov_b32 s1, s23
+; NOOPT-NEXT: s_mov_b32 s2, s21
+; NOOPT-NEXT: s_mov_b32 s3, s19
+; NOOPT-NEXT: s_mov_b32 s4, s27
+; NOOPT-NEXT: s_mov_b32 s5, s26
+; NOOPT-NEXT: s_mov_b32 s6, s25
+; NOOPT-NEXT: s_mov_b32 s7, s24
+; NOOPT-NEXT: s_mov_b32 s8, s23
+; NOOPT-NEXT: s_mov_b32 s9, s22
+; NOOPT-NEXT: s_mov_b32 s10, s21
+; NOOPT-NEXT: s_mov_b32 s11, s20
+; NOOPT-NEXT: s_mov_b32 s12, s19
+; NOOPT-NEXT: s_mov_b32 s13, s18
+; NOOPT-NEXT: s_mov_b32 s14, s17
+; NOOPT-NEXT: s_mov_b32 s15, s16
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 5
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 6
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 7
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 8
+; NOOPT-NEXT: v_writelane_b32 v0, s4, 9
+; NOOPT-NEXT: v_writelane_b32 v0, s5, 10
+; NOOPT-NEXT: v_writelane_b32 v0, s6, 11
+; NOOPT-NEXT: v_writelane_b32 v0, s7, 12
+; NOOPT-NEXT: v_writelane_b32 v0, s8, 13
+; NOOPT-NEXT: v_writelane_b32 v0, s9, 14
+; NOOPT-NEXT: v_writelane_b32 v0, s10, 15
+; NOOPT-NEXT: v_writelane_b32 v0, s11, 16
+; NOOPT-NEXT: v_writelane_b32 v0, s12, 17
+; NOOPT-NEXT: v_writelane_b32 v0, s13, 18
+; NOOPT-NEXT: v_writelane_b32 v0, s14, 19
+; NOOPT-NEXT: v_writelane_b32 v0, s15, 20
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v16, s15
+; NOOPT-NEXT: v_mov_b32_e32 v15, s14
+; NOOPT-NEXT: v_mov_b32_e32 v14, s13
+; NOOPT-NEXT: v_mov_b32_e32 v13, s12
+; NOOPT-NEXT: v_mov_b32_e32 v12, s11
+; NOOPT-NEXT: v_mov_b32_e32 v11, s10
+; NOOPT-NEXT: v_mov_b32_e32 v10, s9
+; NOOPT-NEXT: v_mov_b32_e32 v9, s8
+; NOOPT-NEXT: v_mov_b32_e32 v8, s7
+; NOOPT-NEXT: v_mov_b32_e32 v7, s6
+; NOOPT-NEXT: v_mov_b32_e32 v6, s5
+; NOOPT-NEXT: v_mov_b32_e32 v5, s4
+; NOOPT-NEXT: v_mov_b32_e32 v4, s3
+; NOOPT-NEXT: v_mov_b32_e32 v3, s2
+; NOOPT-NEXT: v_mov_b32_e32 v2, s1
+; NOOPT-NEXT: v_mov_b32_e32 v1, s0
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 21
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 22
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: ; implicit-def: $vgpr0
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 23
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 24
+; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:80 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movrels_b32_e32 v1, v1
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 23
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 24
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB16_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 21
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 22
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: s_mov_b32 s4, 17
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: s_mov_b32 s16, s4
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 5
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 6
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 7
+; NOOPT-NEXT: v_readlane_b32 s3, v0, 8
+; NOOPT-NEXT: v_readlane_b32 s4, v0, 9
+; NOOPT-NEXT: v_readlane_b32 s5, v0, 10
+; NOOPT-NEXT: v_readlane_b32 s6, v0, 11
+; NOOPT-NEXT: v_readlane_b32 s7, v0, 12
+; NOOPT-NEXT: v_readlane_b32 s8, v0, 13
+; NOOPT-NEXT: v_readlane_b32 s9, v0, 14
+; NOOPT-NEXT: v_readlane_b32 s10, v0, 15
+; NOOPT-NEXT: v_readlane_b32 s11, v0, 16
+; NOOPT-NEXT: v_readlane_b32 s12, v0, 17
+; NOOPT-NEXT: v_readlane_b32 s13, v0, 18
+; NOOPT-NEXT: v_readlane_b32 s14, v0, 19
+; NOOPT-NEXT: v_readlane_b32 s15, v0, 20
+; NOOPT-NEXT: v_writelane_b32 v0, s16, 25
+; NOOPT-NEXT: v_mov_b32_e32 v16, s15
+; NOOPT-NEXT: v_mov_b32_e32 v15, s14
+; NOOPT-NEXT: v_mov_b32_e32 v14, s13
+; NOOPT-NEXT: v_mov_b32_e32 v13, s12
+; NOOPT-NEXT: v_mov_b32_e32 v12, s11
+; NOOPT-NEXT: v_mov_b32_e32 v11, s10
+; NOOPT-NEXT: v_mov_b32_e32 v10, s9
+; NOOPT-NEXT: v_mov_b32_e32 v9, s8
+; NOOPT-NEXT: v_mov_b32_e32 v8, s7
+; NOOPT-NEXT: v_mov_b32_e32 v7, s6
+; NOOPT-NEXT: v_mov_b32_e32 v6, s5
+; NOOPT-NEXT: v_mov_b32_e32 v5, s4
+; NOOPT-NEXT: v_mov_b32_e32 v4, s3
+; NOOPT-NEXT: v_mov_b32_e32 v3, s2
+; NOOPT-NEXT: v_mov_b32_e32 v2, s1
+; NOOPT-NEXT: v_mov_b32_e32 v1, s0
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 26
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 27
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: ; implicit-def: $vgpr0
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB16_4: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 28
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 29
+; NOOPT-NEXT: buffer_load_dword v18, off, s[36:39], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:88 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:92 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:96 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[36:39], 0 offset:100 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[36:39], 0 offset:104 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[36:39], 0 offset:108 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[36:39], 0 offset:112 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[36:39], 0 offset:116 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[36:39], 0 offset:120 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[36:39], 0 offset:124 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[36:39], 0 offset:128 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[36:39], 0 offset:132 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[36:39], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[36:39], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[36:39], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[36:39], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[36:39], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v17
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v17
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movrels_b32_e32 v1, v1
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v1, off, s[36:39], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 28
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 29
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB16_4
+; NOOPT-NEXT: ; %bb.5:
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 26
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 27
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.6:
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s4, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s5, v0, 1
+; NOOPT-NEXT: v_readlane_b32 s6, v0, 2
+; NOOPT-NEXT: v_readlane_b32 s7, v0, 3
+; NOOPT-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[36:39], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[36:39], 0 offset:84 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 30
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 31
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execz .LBB16_8
+; NOOPT-NEXT: ; %bb.7: ; %bb1
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s4, v0, 25
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: v_mov_b32_e32 v0, s4
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: .LBB16_8: ; %bb2
+; NOOPT-NEXT: s_or_saveexec_b64 s[28:29], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[36:39], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[28:29]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 30
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 31
+; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_vgpr_offset_multiple_in_block:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; SI-MOVREL-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_mov_b32 s11, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, 0
+; SI-MOVREL-NEXT: s_mov_b32 s7, s11
+; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-MOVREL-NEXT: s_mov_b32 s10, -1
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: s_mov_b32 s4, 17
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: v_add_i32_e64 v0, s[0:1], 1, v1
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, 9, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 5, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 6, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 7, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 8, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 9, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 10, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 11, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 12, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 13, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 14, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v1
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v1, 16, v2, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v2, 15, v3, s[0:1]
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0
+; SI-MOVREL-NEXT: v_cndmask_b32_e64 v0, 16, v2, s[0:1]
+; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SI-MOVREL-NEXT: s_cbranch_execz .LBB16_2
+; SI-MOVREL-NEXT: ; %bb.1: ; %bb1
+; SI-MOVREL-NEXT: s_waitcnt expcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: .LBB16_2: ; %bb2
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: extract_vgpr_offset_multiple_in_block:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dword v2, v[1:2] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: s_mov_b32 s4, 17
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e64 v3, s[0:1], 1, v2
+; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1]
+; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 7, 9, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 5, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 6, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 7, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 8, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 9, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 10, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 11, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 12, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 13, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2
+; VI-NEXT: v_cndmask_b32_e64 v4, 15, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v3
+; VI-NEXT: v_cndmask_b32_e64 v5, 14, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2
+; VI-NEXT: v_cndmask_b32_e64 v2, 16, v4, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v3
+; VI-NEXT: v_cndmask_b32_e64 v4, 15, v5, s[0:1]
+; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v3
+; VI-NEXT: v_cndmask_b32_e64 v3, 16, v4, s[0:1]
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; VI-NEXT: s_cbranch_execz .LBB16_2
+; VI-NEXT: ; %bb.1: ; %bb1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: flat_store_dword v[0:1], v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: .LBB16_2: ; %bb2
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_vgpr_offset_multiple_in_block:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dword v2, v1, s[0:1] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: s_mov_b32 s4, 17
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v0, 1, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, 9, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, 9, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 5, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 6, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 4, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 5, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 7, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 5, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 6, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 8, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 6, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 7, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 9, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 7, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 8, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 10, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 8, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 9, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 11, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 9, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 10, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 12, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 10, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 11, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 13, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 11, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 12, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 14, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 12, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 13, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 13, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v4, 14, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v2
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v2, 16, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 14, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v3, 15, v4, s[0:1]
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e64 s[0:1], 15, v0
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e64 v0, 16, v3, s[0:1]
+; GFX9-IDXMODE-NEXT: global_store_dword v1, v2, s[6:7]
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB16_2
+; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: .LBB16_2: ; %bb2
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = zext i32 %id to i64
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
%idx0 = load volatile i32, ptr addrspace(1) %gep
@@ -349,62 +4716,1870 @@ bb2:
ret void
}
-; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to
-; avoid very different schedule induced isses with gfx9.
-; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) {
+; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s29, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s30, -1
+; NOOPT-NEXT: s_mov_b32 s31, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s28, s28, s9
+; NOOPT-NEXT: s_addc_u32 s29, s29, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xd
+; NOOPT-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s24, s19
+; NOOPT-NEXT: s_mov_b32 s20, s18
+; NOOPT-NEXT: s_mov_b32 s18, 0xf000
+; NOOPT-NEXT: s_mov_b32 s19, -1
+; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21_sgpr22_sgpr23
+; NOOPT-NEXT: s_mov_b32 s21, s24
+; NOOPT-NEXT: s_mov_b32 s22, s19
+; NOOPT-NEXT: s_mov_b32 s23, s18
+; NOOPT-NEXT: v_writelane_b32 v16, s20, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s21, 1
+; NOOPT-NEXT: v_writelane_b32 v16, s22, 2
+; NOOPT-NEXT: v_writelane_b32 v16, s23, 3
+; NOOPT-NEXT: s_mov_b32 s20, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s20, 4
+; NOOPT-NEXT: ; kill: def $sgpr20 killed $sgpr20 def $sgpr20_sgpr21
+; NOOPT-NEXT: s_mov_b32 s21, s18
+; NOOPT-NEXT: ; kill: def $sgpr16_sgpr17 killed $sgpr16_sgpr17 def $sgpr16_sgpr17_sgpr18_sgpr19
+; NOOPT-NEXT: s_mov_b64 s[18:19], s[20:21]
+; NOOPT-NEXT: s_mov_b32 s20, 2
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_lshlrev_b32_e64 v0, s20, v0
+; NOOPT-NEXT: s_mov_b32 s20, 0
+; NOOPT-NEXT: ; implicit-def: $sgpr20
+; NOOPT-NEXT: v_mov_b32_e32 v2, 0
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v2
+; NOOPT-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b32 s16, 1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_add_i32_e64 v0, s[16:17], v0, s16
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: v_mov_b32 v0, 62
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s0
+; NOOPT-NEXT: v_mov_b32_e32 v1, s1
+; NOOPT-NEXT: v_mov_b32_e32 v2, s2
+; NOOPT-NEXT: v_mov_b32_e32 v3, s3
+; NOOPT-NEXT: v_mov_b32_e32 v4, s4
+; NOOPT-NEXT: v_mov_b32_e32 v5, s5
+; NOOPT-NEXT: v_mov_b32_e32 v6, s6
+; NOOPT-NEXT: v_mov_b32_e32 v7, s7
+; NOOPT-NEXT: v_mov_b32_e32 v8, s8
+; NOOPT-NEXT: v_mov_b32_e32 v9, s9
+; NOOPT-NEXT: v_mov_b32_e32 v10, s10
+; NOOPT-NEXT: v_mov_b32_e32 v11, s11
+; NOOPT-NEXT: v_mov_b32_e32 v12, s12
+; NOOPT-NEXT: v_mov_b32_e32 v13, s13
+; NOOPT-NEXT: v_mov_b32_e32 v14, s14
+; NOOPT-NEXT: v_mov_b32_e32 v15, s15
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 5
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 6
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 7
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:80 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movreld_b32_e32 v1, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:148 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 7
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 8
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB17_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 5
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 6
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:88 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:92 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:96 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:100 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:104 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:108 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:112 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:116 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:120 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:124 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:128 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:132 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: v_mov_b32_e32 v17, 63
+; NOOPT-NEXT: buffer_store_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: s_waitcnt vmcnt(1)
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 9
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 10
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB17_4: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 11
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:216 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movreld_b32_e32 v1, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:220 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:224 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:228 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:232 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:236 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:240 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:244 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:248 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:252 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:256 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:260 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:264 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:268 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:272 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:276 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:280 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[28:31], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[28:31], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[28:31], 0 offset:160 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[28:31], 0 offset:164 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[28:31], 0 offset:168 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[28:31], 0 offset:172 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[28:31], 0 offset:176 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[28:31], 0 offset:180 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[28:31], 0 offset:184 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[28:31], 0 offset:188 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[28:31], 0 offset:192 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[28:31], 0 offset:196 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[28:31], 0 offset:200 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[28:31], 0 offset:204 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[28:31], 0 offset:208 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[28:31], 0 offset:212 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 11
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 12
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB17_4
+; NOOPT-NEXT: ; %bb.5:
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 9
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 10
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.6:
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s4, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s5, v0, 1
+; NOOPT-NEXT: v_readlane_b32 s6, v0, 2
+; NOOPT-NEXT: v_readlane_b32 s7, v0, 3
+; NOOPT-NEXT: buffer_load_dword v1, off, s[28:31], 0 offset:84 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[28:31], 0 offset:220 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[28:31], 0 offset:224 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v19, off, s[28:31], 0 offset:228 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v20, off, s[28:31], 0 offset:232 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v21, off, s[28:31], 0 offset:236 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v22, off, s[28:31], 0 offset:240 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v23, off, s[28:31], 0 offset:244 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v24, off, s[28:31], 0 offset:248 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v25, off, s[28:31], 0 offset:252 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v26, off, s[28:31], 0 offset:256 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v27, off, s[28:31], 0 offset:260 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v28, off, s[28:31], 0 offset:264 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v29, off, s[28:31], 0 offset:268 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v30, off, s[28:31], 0 offset:272 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v31, off, s[28:31], 0 offset:276 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v32, off, s[28:31], 0 offset:280 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v7, v19
+; NOOPT-NEXT: v_mov_b32_e32 v8, v18
+; NOOPT-NEXT: v_mov_b32_e32 v2, v17
+; NOOPT-NEXT: v_mov_b32_e32 v3, v24
+; NOOPT-NEXT: v_mov_b32_e32 v4, v23
+; NOOPT-NEXT: v_mov_b32_e32 v5, v22
+; NOOPT-NEXT: v_mov_b32_e32 v9, v21
+; NOOPT-NEXT: v_mov_b32_e32 v14, v28
+; NOOPT-NEXT: v_mov_b32_e32 v15, v27
+; NOOPT-NEXT: v_mov_b32_e32 v16, v26
+; NOOPT-NEXT: v_mov_b32_e32 v10, v25
+; NOOPT-NEXT: v_mov_b32_e32 v11, v32
+; NOOPT-NEXT: v_mov_b32_e32 v12, v31
+; NOOPT-NEXT: v_mov_b32_e32 v13, v30
+; NOOPT-NEXT: v_mov_b32_e32 v17, v29
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v18, v13
+; NOOPT-NEXT: v_mov_b32_e32 v19, v12
+; NOOPT-NEXT: v_mov_b32_e32 v20, v11
+; NOOPT-NEXT: buffer_store_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v11, v16
+; NOOPT-NEXT: v_mov_b32_e32 v12, v15
+; NOOPT-NEXT: v_mov_b32_e32 v13, v14
+; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 offset:32
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10_vgpr11_vgpr12 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v10, v5
+; NOOPT-NEXT: v_mov_b32_e32 v11, v4
+; NOOPT-NEXT: v_mov_b32_e32 v12, v3
+; NOOPT-NEXT: buffer_store_dwordx4 v[9:12], off, s[4:7], 0 offset:16
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v3, v8
+; NOOPT-NEXT: v_mov_b32_e32 v4, v7
+; NOOPT-NEXT: v_mov_b32_e32 v5, v6
+; NOOPT-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 13
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 14
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execz .LBB17_8
+; NOOPT-NEXT: ; %bb.7: ; %bb1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: .LBB17_8: ; %bb2
+; NOOPT-NEXT: s_or_saveexec_b64 s[26:27], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[28:31], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[26:27]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 13
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 14
+; NOOPT-NEXT: s_or_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_vgpr_offset_multiple_in_block:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, 0
+; SI-MOVREL-NEXT: s_mov_b32 s7, s23
+; SI-MOVREL-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dword v2, v[1:2], s[4:7], 0 addr64 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: v_mov_b32 v1, 62
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: s_mov_b32 s22, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s16
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s17
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s19
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v18, s7
+; SI-MOVREL-NEXT: v_add_i32_e32 v19, vcc, 1, v2
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
+; SI-MOVREL-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
+; SI-MOVREL-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; SI-MOVREL-NEXT: s_cbranch_execz .LBB17_2
+; SI-MOVREL-NEXT: ; %bb.1: ; %bb1
+; SI-MOVREL-NEXT: buffer_store_dword v1, off, s[20:23], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: .LBB17_2: ; %bb2
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-LABEL: insert_vgpr_offset_multiple_in_block:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT: flat_load_dword v2, v[1:2] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: v_mov_b32 v1, 62
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s16
+; VI-NEXT: v_mov_b32_e32 v4, s17
+; VI-NEXT: v_mov_b32_e32 v5, s18
+; VI-NEXT: v_mov_b32_e32 v6, s19
+; VI-NEXT: v_mov_b32_e32 v7, s12
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mov_b32_e32 v9, s14
+; VI-NEXT: v_mov_b32_e32 v10, s15
+; VI-NEXT: v_mov_b32_e32 v11, s8
+; VI-NEXT: v_mov_b32_e32 v12, s9
+; VI-NEXT: v_mov_b32_e32 v13, s10
+; VI-NEXT: v_mov_b32_e32 v14, s11
+; VI-NEXT: v_mov_b32_e32 v15, s4
+; VI-NEXT: v_mov_b32_e32 v16, s5
+; VI-NEXT: v_mov_b32_e32 v17, s6
+; VI-NEXT: v_mov_b32_e32 v18, s7
+; VI-NEXT: s_add_u32 s2, s0, 48
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v2
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
+; VI-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
+; VI-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
+; VI-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
+; VI-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
+; VI-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
+; VI-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
+; VI-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
+; VI-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
+; VI-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
+; VI-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; VI-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
+; VI-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; VI-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; VI-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
+; VI-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
+; VI-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
+; VI-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; VI-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
+; VI-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
+; VI-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
+; VI-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
+; VI-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
+; VI-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
+; VI-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
+; VI-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
+; VI-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
+; VI-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
+; VI-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
+; VI-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
+; VI-NEXT: v_mov_b32_e32 v19, s3
+; VI-NEXT: v_mov_b32_e32 v18, s2
+; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_mov_b32_e32 v15, s3
+; VI-NEXT: v_mov_b32_e32 v14, s2
+; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v11, s3
+; VI-NEXT: v_mov_b32_e32 v10, s2
+; VI-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_mov_b32_e32 v6, s0
+; VI-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; VI-NEXT: s_cbranch_execz .LBB17_2
+; VI-NEXT: ; %bb.1: ; %bb1
+; VI-NEXT: flat_store_dword v[0:1], v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: .LBB17_2: ; %bb2
+; VI-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_vgpr_offset_multiple_in_block:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dword v3, v1, s[0:1] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: v_mov_b32 v1, 62
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s18
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s19
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, s4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, s6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, s7
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 12, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 13, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 14, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 15, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 8, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 9, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 10, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v27, v10, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 11, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, v11, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 7, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, v15, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, v16, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3
+; GFX9-IDXMODE-NEXT: v_add_u32_e32 v20, 1, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, v18, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, v19, v1, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 3, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v6, 63, v3, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 2, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v5, 63, v5, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 1, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v3, 63, v12, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 7, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 6, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 5, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 4, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 11, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v14, 63, v11, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 10, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v13, 63, v27, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 9, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 8, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 15, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v18, 63, v24, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 14, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 13, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_ne_u32_e32 vcc, 12, v20
+; GFX9-IDXMODE-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[15:18], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[11:14], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[7:10], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v2, v[3:6], s[0:1]
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-IDXMODE-NEXT: s_cbranch_execz .LBB17_2
+; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb1
+; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v1, off
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: .LBB17_2: ; %bb2
+; GFX9-IDXMODE-NEXT: s_endpgm
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
+ %idx0 = load volatile i32, ptr addrspace(1) %gep
+ %idx1 = add i32 %idx0, 1
+ %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
+ %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
+ %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
+ %cmp = icmp eq i32 %id, 0
+ br i1 %cmp, label %bb1, label %bb2
+bb1:
+ store volatile i32 %live.out.val, ptr addrspace(1) undef
+ br label %bb2
-; GCN-LABEL: {{^}}insert_adjacent_blocks:
-define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb2:
+ ret void
+}
+
+; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
+; gpr_idx mode switching sequence is expanded late for this reason.
+define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) {
+; NOOPT-LABEL: insert_w_offset_multiple_in_block:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 1
+; NOOPT-NEXT: s_add_i32 s5, s4, s5
+; NOOPT-NEXT: s_mov_b32 s6, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s14, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s17, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s18, 4.0
+; NOOPT-NEXT: s_mov_b32 s19, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s20, 2.0
+; NOOPT-NEXT: s_mov_b32 s21, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v23, s21
+; NOOPT-NEXT: v_mov_b32_e32 v14, s20
+; NOOPT-NEXT: v_mov_b32_e32 v13, s19
+; NOOPT-NEXT: v_mov_b32_e32 v12, s18
+; NOOPT-NEXT: v_mov_b32_e32 v11, s17
+; NOOPT-NEXT: v_mov_b32_e32 v10, s16
+; NOOPT-NEXT: v_mov_b32_e32 v9, s15
+; NOOPT-NEXT: v_mov_b32_e32 v8, s14
+; NOOPT-NEXT: v_mov_b32_e32 v7, s13
+; NOOPT-NEXT: v_mov_b32_e32 v6, s12
+; NOOPT-NEXT: v_mov_b32_e32 v5, s11
+; NOOPT-NEXT: v_mov_b32_e32 v4, s10
+; NOOPT-NEXT: v_mov_b32_e32 v3, s9
+; NOOPT-NEXT: v_mov_b32_e32 v2, s8
+; NOOPT-NEXT: v_mov_b32_e32 v1, s7
+; NOOPT-NEXT: v_mov_b32_e32 v0, s6
+; NOOPT-NEXT: ; kill: def $vgpr23 killed $vgpr23 def $vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v24, v14
+; NOOPT-NEXT: v_mov_b32_e32 v25, v13
+; NOOPT-NEXT: v_mov_b32_e32 v26, v12
+; NOOPT-NEXT: v_mov_b32_e32 v27, v11
+; NOOPT-NEXT: v_mov_b32_e32 v28, v10
+; NOOPT-NEXT: v_mov_b32_e32 v29, v9
+; NOOPT-NEXT: v_mov_b32_e32 v30, v8
+; NOOPT-NEXT: v_mov_b32_e32 v31, v7
+; NOOPT-NEXT: v_mov_b32_e32 v32, v6
+; NOOPT-NEXT: v_mov_b32_e32 v33, v5
+; NOOPT-NEXT: v_mov_b32_e32 v34, v4
+; NOOPT-NEXT: v_mov_b32_e32 v35, v3
+; NOOPT-NEXT: v_mov_b32_e32 v36, v2
+; NOOPT-NEXT: v_mov_b32_e32 v37, v1
+; NOOPT-NEXT: v_mov_b32_e32 v38, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x41880000
+; NOOPT-NEXT: s_mov_b32 m0, s5
+; NOOPT-NEXT: v_movreld_b32_e32 v23, v0
+; NOOPT-NEXT: s_mov_b32 s5, 2
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_mov_b32_e32 v7, v23
+; NOOPT-NEXT: v_mov_b32_e32 v8, v24
+; NOOPT-NEXT: v_mov_b32_e32 v9, v25
+; NOOPT-NEXT: v_mov_b32_e32 v10, v26
+; NOOPT-NEXT: v_mov_b32_e32 v11, v27
+; NOOPT-NEXT: v_mov_b32_e32 v12, v28
+; NOOPT-NEXT: v_mov_b32_e32 v13, v29
+; NOOPT-NEXT: v_mov_b32_e32 v14, v30
+; NOOPT-NEXT: v_mov_b32_e32 v15, v31
+; NOOPT-NEXT: v_mov_b32_e32 v16, v32
+; NOOPT-NEXT: v_mov_b32_e32 v17, v33
+; NOOPT-NEXT: v_mov_b32_e32 v18, v34
+; NOOPT-NEXT: v_mov_b32_e32 v19, v35
+; NOOPT-NEXT: v_mov_b32_e32 v20, v36
+; NOOPT-NEXT: v_mov_b32_e32 v21, v37
+; NOOPT-NEXT: v_mov_b32_e32 v22, v38
+; NOOPT-NEXT: v_movreld_b32_e32 v7, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v38
+; NOOPT-NEXT: v_mov_b32_e32 v5, v37
+; NOOPT-NEXT: v_mov_b32_e32 v6, v36
+; NOOPT-NEXT: v_mov_b32_e32 v0, v35
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v34
+; NOOPT-NEXT: v_mov_b32_e32 v5, v33
+; NOOPT-NEXT: v_mov_b32_e32 v6, v32
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v31
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v30
+; NOOPT-NEXT: v_mov_b32_e32 v5, v29
+; NOOPT-NEXT: v_mov_b32_e32 v6, v28
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v27
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v26
+; NOOPT-NEXT: v_mov_b32_e32 v5, v25
+; NOOPT-NEXT: v_mov_b32_e32 v6, v24
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v23
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_w_offset_multiple_in_block:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s2, s4, 1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
+; SI-MOVREL-NEXT: s_mov_b32 m0, s2
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
+; SI-MOVREL-NEXT: s_add_i32 s4, s4, 2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v27, v11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v26, v10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v25, v9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v24, v8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v23, v7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v22, v6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v21, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v3
+; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v1
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v0
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_w_offset_multiple_in_block:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 s2, s4, 1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v32, 0x41880000
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_i32 s4, s4, 2
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v32
+; VI-MOVREL-NEXT: v_mov_b32_e32 v31, v15
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v30, v14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v29, v13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v28, v12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v27, v11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v26, v10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v25, v9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v24, v8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v23, v7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v22, v6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v21, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v0
+; VI-MOVREL-NEXT: s_mov_b32 m0, s4
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v16, v32
+; VI-MOVREL-NEXT: v_mov_b32_e32 v33, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v32, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 64
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x70
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: s_add_u32 s4, s0, 0x60
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
+; VI-MOVREL-NEXT: s_addc_u32 s5, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: s_add_u32 s0, s0, 0x50
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; VI-MOVREL-NEXT: s_addc_u32 s1, s1, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s3
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s2, s4, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v27, v11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v26, v10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v25, v9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v24, v8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v23, v7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v22, v6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v21, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v0
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v32
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v33, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v32, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[32:33], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 64
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x70
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: s_add_u32 s4, s0, 0x60
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[28:31]
+; VI-IDXMODE-NEXT: s_addc_u32 s5, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: s_add_u32 s0, s0, 0x50
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s3
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_w_offset_multiple_in_block:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s2, s4, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0x41880000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v32
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v31, v15
+; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v30, v14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v29, v13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v28, v12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v27, v11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v26, v10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v25, v9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v24, v8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v23, v7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v22, v6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v3
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v2
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v32
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v32, 0
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX9-IDXMODE-NEXT: s_endpgm
+entry:
+ %add1 = add i32 %in, 1
+ %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
+ %add2 = add i32 %in, 2
+ %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
+ store <16 x float> %ins1, ptr addrspace(1) %out1
+ %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
+ store <16 x float> %ins2, ptr addrspace(1) %out2
+
+ ret void
+}
+
+; Make sure we don't hit "use of undefined register" errors when expanding an
+; extractelement with an undef index in two adjacent blocks (bb1 and bb4) whose results meet in a phi in bb7.
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
+; NOOPT-LABEL: extract_adjacent_blocks:
+; NOOPT: ; %bb.0: ; %bb
+; NOOPT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s14, -1
+; NOOPT-NEXT: s_mov_b32 s15, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s12, s12, s9
+; NOOPT-NEXT: s_addc_u32 s13, s13, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; NOOPT-NEXT: s_load_dword s2, s[2:3], 0x9
+; NOOPT-NEXT: s_mov_b64 s[0:1], -1
+; NOOPT-NEXT: ; implicit-def: $sgpr3
+; NOOPT-NEXT: s_mov_b32 s3, 0
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_cmp_lg_u32 s2, s3
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 1
+; NOOPT-NEXT: s_mov_b64 s[8:9], exec
+; NOOPT-NEXT: s_mov_b64 exec, -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[8:9]
+; NOOPT-NEXT: s_cbranch_scc1 .LBB19_3
+; NOOPT-NEXT: .LBB19_1: ; %Flow
+; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[8:9]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr2
+; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; NOOPT-NEXT: s_mov_b32 s0, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
+; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_vccnz .LBB19_4
+; NOOPT-NEXT: ; %bb.2: ; %bb1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: ; reg use v[0:3]
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: s_branch .LBB19_4
+; NOOPT-NEXT: .LBB19_3: ; %bb4
+; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[8:9]
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: ; reg use v[1:4]
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: s_mov_b64 s[0:1], 0
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 1
+; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[8:9]
+; NOOPT-NEXT: s_branch .LBB19_1
+; NOOPT-NEXT: .LBB19_4: ; %bb7
+; NOOPT-NEXT: s_or_saveexec_b64 s[8:9], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[8:9]
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: v_mov_b32_e32 v1, s4
+; NOOPT-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_adjacent_blocks:
+; SI-MOVREL: ; %bb.0: ; %bb
+; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0
+; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB19_4
+; SI-MOVREL-NEXT: ; %bb.1: ; %bb4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: ; reg use v[0:3]
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: s_cbranch_execnz .LBB19_3
+; SI-MOVREL-NEXT: .LBB19_2: ; %bb1
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: ; reg use v[0:3]
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: .LBB19_3: ; %bb7
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_endpgm
+; SI-MOVREL-NEXT: .LBB19_4:
+; SI-MOVREL-NEXT: s_branch .LBB19_2
+;
+; VI-LABEL: extract_adjacent_blocks:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_load_dword s0, s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc0 .LBB19_4
+; VI-NEXT: ; %bb.1: ; %bb4
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: ; reg use v[0:3]
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: s_cbranch_execnz .LBB19_3
+; VI-NEXT: .LBB19_2: ; %bb1
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: ; reg use v[0:3]
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: .LBB19_3: ; %bb7
+; VI-NEXT: flat_store_dword v[0:1], v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_endpgm
+; VI-NEXT: .LBB19_4:
+; VI-NEXT: s_branch .LBB19_2
+;
+; GFX9-IDXMODE-LABEL: extract_adjacent_blocks:
+; GFX9-IDXMODE: ; %bb.0: ; %bb
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB19_4
+; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: ; reg use v[0:3]
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB19_3
+; GFX9-IDXMODE-NEXT: .LBB19_2: ; %bb1
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: ; reg use v[0:3]
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: .LBB19_3: ; %bb7
+; GFX9-IDXMODE-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_endpgm
+; GFX9-IDXMODE-NEXT: .LBB19_4:
+; GFX9-IDXMODE-NEXT: s_branch .LBB19_2
bb:
%tmp = icmp eq i32 %arg, 0
br i1 %tmp, label %bb1, label %bb4
-bb1: ; preds = %bb
+bb1:
+ %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
+ %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+ call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) ; Side-effecting asm use of %tmp2; prevents this block from being optimized out
+ br label %bb7
+
+bb4:
+ %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
+ %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+ call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) ; Side-effecting asm use of %tmp5; prevents this block from being optimized out
+ br label %bb7
+
+bb7:
+ %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile float %tmp8, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
+; NOOPT-LABEL: insert_adjacent_blocks:
+; NOOPT: ; %bb.0: ; %bb
+; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s18, -1
+; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s16, s16, s9
+; NOOPT-NEXT: s_addc_u32 s17, s17, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; NOOPT-NEXT: s_mov_b64 s[0:1], s[2:3]
+; NOOPT-NEXT: s_load_dword s2, s[0:1], 0x9
+; NOOPT-NEXT: s_load_dword s0, s[0:1], 0xa
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b64 s[0:1], -1
+; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: s_mov_b32 s3, 0
+; NOOPT-NEXT: s_cmp_lg_u32 s2, s3
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 1
+; NOOPT-NEXT: s_mov_b64 s[12:13], exec
+; NOOPT-NEXT: s_mov_b64 exec, -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_cbranch_scc1 .LBB20_3
+; NOOPT-NEXT: .LBB20_1: ; %Flow
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 0
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; NOOPT-NEXT: s_mov_b32 s0, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0
+; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_vccnz .LBB20_4
+; NOOPT-NEXT: ; %bb.2: ; %bb1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: ; reg use v[0:3]
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: s_branch .LBB20_4
+; NOOPT-NEXT: .LBB20_3: ; %bb4
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s6, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s4, 0xf000
+; NOOPT-NEXT: s_mov_b32 s5, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s6
+; NOOPT-NEXT: s_mov_b32 s2, s5
+; NOOPT-NEXT: s_mov_b32 s3, s4
+; NOOPT-NEXT: buffer_load_dwordx4 v[1:4], off, s[0:3], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ;;#ASMSTART
+; NOOPT-NEXT: ; reg use v[1:4]
+; NOOPT-NEXT: ;;#ASMEND
+; NOOPT-NEXT: s_mov_b64 s[0:1], 0
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 0
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 1
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_branch .LBB20_1
+; NOOPT-NEXT: .LBB20_4: ; %bb7
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s10, s1
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s8, 0xf000
+; NOOPT-NEXT: s_mov_b32 s9, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s10
+; NOOPT-NEXT: s_mov_b32 s2, s9
+; NOOPT-NEXT: s_mov_b32 s3, s8
+; NOOPT-NEXT: v_mov_b32_e32 v1, s4
+; NOOPT-NEXT: v_mov_b32_e32 v2, s5
+; NOOPT-NEXT: v_mov_b32_e32 v3, s6
+; NOOPT-NEXT: v_mov_b32_e32 v4, s7
+; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_adjacent_blocks:
+; SI-MOVREL: ; %bb.0: ; %bb
+; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_cmp_lg_u32 s0, 0
+; SI-MOVREL-NEXT: s_cbranch_scc0 .LBB20_4
+; SI-MOVREL-NEXT: ; %bb.1: ; %bb4
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: ; reg use v[0:3]
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: s_cbranch_execnz .LBB20_3
+; SI-MOVREL-NEXT: .LBB20_2: ; %bb1
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: ;;#ASMSTART
+; SI-MOVREL-NEXT: ; reg use v[0:3]
+; SI-MOVREL-NEXT: ;;#ASMEND
+; SI-MOVREL-NEXT: .LBB20_3: ; %bb7
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_endpgm
+; SI-MOVREL-NEXT: .LBB20_4:
+; SI-MOVREL-NEXT: s_branch .LBB20_2
+;
+; VI-LABEL: insert_adjacent_blocks:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_load_dword s0, s[2:3], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cbranch_scc0 .LBB20_4
+; VI-NEXT: ; %bb.1: ; %bb4
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: ; reg use v[0:3]
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: s_cbranch_execnz .LBB20_3
+; VI-NEXT: .LBB20_2: ; %bb1
+; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ;;#ASMSTART
+; VI-NEXT: ; reg use v[0:3]
+; VI-NEXT: ;;#ASMEND
+; VI-NEXT: .LBB20_3: ; %bb7
+; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_endpgm
+; VI-NEXT: .LBB20_4:
+; VI-NEXT: s_branch .LBB20_2
+;
+; GFX9-IDXMODE-LABEL: insert_adjacent_blocks:
+; GFX9-IDXMODE: ; %bb.0: ; %bb
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX9-IDXMODE-NEXT: s_cbranch_scc0 .LBB20_4
+; GFX9-IDXMODE-NEXT: ; %bb.1: ; %bb4
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: ; reg use v[0:3]
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB20_3
+; GFX9-IDXMODE-NEXT: .LBB20_2: ; %bb1
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v[0:1], off glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: ;;#ASMSTART
+; GFX9-IDXMODE-NEXT: ; reg use v[0:3]
+; GFX9-IDXMODE-NEXT: ;;#ASMEND
+; GFX9-IDXMODE-NEXT: .LBB20_3: ; %bb7
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_endpgm
+; GFX9-IDXMODE-NEXT: .LBB20_4:
+; GFX9-IDXMODE-NEXT: s_branch .LBB20_2
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1:
%tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
%tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
+ call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) ; Prevent block optimize out
br label %bb7
-bb4: ; preds = %bb
+bb4:
%tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
%tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
+ call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) ; Prevent block optimize out
br label %bb7
-bb7: ; preds = %bb4, %bb1
+bb7:
%tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
store volatile <4 x float> %tmp8, ptr addrspace(1) undef
ret void
}
; FIXME: Should be able to fold zero input to movreld to inline imm?
-
-; GCN-LABEL: {{^}}multi_same_block:
-
-; GCN: s_load_dword [[ARG:s[0-9]+]]
-
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
-; MOVREL: s_waitcnt
-; MOVREL: s_add_i32 m0, [[ARG]], -16
-; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
-; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
-; MOVREL: s_mov_b32 m0, -1
-
-
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
-; IDXMODE: s_waitcnt
-; IDXMODE: s_add_i32 [[ARG]], [[ARG]], -16
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, -4.0
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN: ds_write_b32
-; GCN: ds_write_b32
-; GCN: s_endpgm
-define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
+define amdgpu_kernel void @multi_same_block(i32 %arg) {
+; NOOPT-LABEL: multi_same_block:
+; NOOPT: ; %bb.0: ; %bb
+; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9
+; NOOPT-NEXT: s_mov_b32 s8, 0x41900000
+; NOOPT-NEXT: ; implicit-def: $sgpr9
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr7
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr6
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr5
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr3
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr2
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr10
+; NOOPT-NEXT: v_mov_b32_e32 v12, s9
+; NOOPT-NEXT: v_mov_b32_e32 v7, s8
+; NOOPT-NEXT: v_mov_b32_e32 v6, s7
+; NOOPT-NEXT: v_mov_b32_e32 v5, s6
+; NOOPT-NEXT: v_mov_b32_e32 v4, s5
+; NOOPT-NEXT: v_mov_b32_e32 v3, s4
+; NOOPT-NEXT: v_mov_b32_e32 v2, s3
+; NOOPT-NEXT: v_mov_b32_e32 v1, s2
+; NOOPT-NEXT: v_mov_b32_e32 v0, s1
+; NOOPT-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v13, v7
+; NOOPT-NEXT: v_mov_b32_e32 v14, v6
+; NOOPT-NEXT: v_mov_b32_e32 v15, v5
+; NOOPT-NEXT: v_mov_b32_e32 v16, v4
+; NOOPT-NEXT: v_mov_b32_e32 v17, v3
+; NOOPT-NEXT: v_mov_b32_e32 v18, v2
+; NOOPT-NEXT: v_mov_b32_e32 v19, v1
+; NOOPT-NEXT: v_mov_b32_e32 v20, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, 4.0
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_add_i32 m0, s0, -16
+; NOOPT-NEXT: v_movreld_b32_e32 v12, v0
+; NOOPT-NEXT: s_mov_b32 s4, 0x41b0cccd
+; NOOPT-NEXT: ; implicit-def: $sgpr9
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr8
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr7
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr6
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr5
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr3
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr2
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: ; implicit-def: $sgpr10
+; NOOPT-NEXT: v_mov_b32_e32 v3, s9
+; NOOPT-NEXT: v_mov_b32_e32 v25, s8
+; NOOPT-NEXT: v_mov_b32_e32 v24, s7
+; NOOPT-NEXT: v_mov_b32_e32 v23, s6
+; NOOPT-NEXT: v_mov_b32_e32 v22, s5
+; NOOPT-NEXT: v_mov_b32_e32 v21, s4
+; NOOPT-NEXT: v_mov_b32_e32 v2, s3
+; NOOPT-NEXT: v_mov_b32_e32 v1, s2
+; NOOPT-NEXT: v_mov_b32_e32 v0, s1
+; NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v4, v25
+; NOOPT-NEXT: v_mov_b32_e32 v5, v24
+; NOOPT-NEXT: v_mov_b32_e32 v6, v23
+; NOOPT-NEXT: v_mov_b32_e32 v7, v22
+; NOOPT-NEXT: v_mov_b32_e32 v8, v21
+; NOOPT-NEXT: v_mov_b32_e32 v9, v2
+; NOOPT-NEXT: v_mov_b32_e32 v10, v1
+; NOOPT-NEXT: v_mov_b32_e32 v11, v0
+; NOOPT-NEXT: v_mov_b32_e32 v0, -4.0
+; NOOPT-NEXT: s_add_i32 m0, s0, -16
+; NOOPT-NEXT: v_movreld_b32_e32 v3, v0
+; NOOPT-NEXT: v_mov_b32_e32 v2, v13
+; NOOPT-NEXT: v_mov_b32_e32 v1, v8
+; NOOPT-NEXT: s_mov_b32 m0, -1
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s0
+; NOOPT-NEXT: ds_write_b32 v0, v2
+; NOOPT-NEXT: s_mov_b32 m0, -1
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s0
+; NOOPT-NEXT: ds_write_b32 v0, v1
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: multi_same_block:
+; SI-MOVREL: ; %bb.0: ; %bb
+; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 m0, s0, -16
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0
+; SI-MOVREL-NEXT: s_mov_b32 m0, -1
+; SI-MOVREL-NEXT: ds_write_b32 v0, v1
+; SI-MOVREL-NEXT: ds_write_b32 v0, v9
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: multi_same_block:
+; VI-MOVREL: ; %bb.0: ; %bb
+; VI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 0x41900000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41b0cccd
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 m0, s0, -16
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, 4.0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v4, -4.0
+; VI-MOVREL-NEXT: s_mov_b32 m0, -1
+; VI-MOVREL-NEXT: ds_write_b32 v0, v1
+; VI-MOVREL-NEXT: ds_write_b32 v0, v9
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: multi_same_block:
+; VI-IDXMODE: ; %bb.0: ; %bb
+; VI-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s0, s0, -16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_mov_b32 m0, -1
+; VI-IDXMODE-NEXT: ds_write_b32 v0, v1
+; VI-IDXMODE-NEXT: ds_write_b32 v0, v9
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: multi_same_block:
+; GFX9-IDXMODE: ; %bb.0: ; %bb
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 0x41900000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41b0cccd
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, -16
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, -4.0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v1
+; GFX9-IDXMODE-NEXT: ds_write_b32 v0, v9
+; GFX9-IDXMODE-NEXT: s_endpgm
bb:
%tmp1 = add i32 %arg, -16
%tmp2 = insertelement <9 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01, float 2.300000e+01, float 2.400000e+01, float 2.500000e+01>, float 4.000000e+00, i32 %tmp1
@@ -420,20 +6595,192 @@ bb:
}
; offset puts outside of superregister boundaries, so clamp to 1st element.
-; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
-; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]
-; GCN-DAG: s_load_dword [[IDX0:s[0-9]+]]
-; GCN-DAG: s_add_i32 [[IDX:s[0-9]+]], [[IDX0]], 15
-
-; MOVREL: s_mov_b32 m0, [[IDX]]
-; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
-
-; IDXMODE: s_set_gpr_idx_on [[IDX]], gpr_idx(SRC0)
-; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
+; NOOPT-LABEL: extract_largest_inbounds_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s7, s9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; NOOPT-NEXT: s_mov_b32 s9, s7
+; NOOPT-NEXT: s_mov_b32 s10, s6
+; NOOPT-NEXT: s_mov_b32 s11, s5
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v16, v15
+; NOOPT-NEXT: v_mov_b32_e32 v17, v14
+; NOOPT-NEXT: v_mov_b32_e32 v18, v13
+; NOOPT-NEXT: v_mov_b32_e32 v19, v12
+; NOOPT-NEXT: v_mov_b32_e32 v20, v11
+; NOOPT-NEXT: v_mov_b32_e32 v21, v10
+; NOOPT-NEXT: v_mov_b32_e32 v22, v9
+; NOOPT-NEXT: v_mov_b32_e32 v23, v8
+; NOOPT-NEXT: v_mov_b32_e32 v24, v7
+; NOOPT-NEXT: v_mov_b32_e32 v25, v6
+; NOOPT-NEXT: v_mov_b32_e32 v26, v5
+; NOOPT-NEXT: v_mov_b32_e32 v27, v4
+; NOOPT-NEXT: v_mov_b32_e32 v28, v3
+; NOOPT-NEXT: v_mov_b32_e32 v29, v2
+; NOOPT-NEXT: v_mov_b32_e32 v30, v1
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 s5, 15
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_largest_inbounds_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s2, s6
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 s0, s10
+; SI-MOVREL-NEXT: s_mov_b32 s1, s11
+; SI-MOVREL-NEXT: s_mov_b32 s3, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s12, s12, 15
+; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_mov_b32 s4, s8
+; SI-MOVREL-NEXT: s_mov_b32 s5, s9
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_largest_inbounds_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: s_add_i32 s2, s2, 15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_largest_inbounds_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5
+; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 15
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_largest_inbounds_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 15
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ld = load volatile <16 x i32>, ptr addrspace(1) %in
%offset = add i32 %idx, 15
@@ -442,20 +6789,192 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
-; GCN-DAG: buffer_load_dwordx4 v[[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]]
-; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
-; GCN: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 16
-
-; MOVREL: s_mov_b32 m0, [[ADD_IDX]]
-; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
-
-; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], gpr_idx(SRC0)
-; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
+; NOOPT-LABEL: extract_out_of_bounds_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s7, s9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; NOOPT-NEXT: s_mov_b32 s9, s7
+; NOOPT-NEXT: s_mov_b32 s10, s6
+; NOOPT-NEXT: s_mov_b32 s11, s5
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v16, v15
+; NOOPT-NEXT: v_mov_b32_e32 v17, v14
+; NOOPT-NEXT: v_mov_b32_e32 v18, v13
+; NOOPT-NEXT: v_mov_b32_e32 v19, v12
+; NOOPT-NEXT: v_mov_b32_e32 v20, v11
+; NOOPT-NEXT: v_mov_b32_e32 v21, v10
+; NOOPT-NEXT: v_mov_b32_e32 v22, v9
+; NOOPT-NEXT: v_mov_b32_e32 v23, v8
+; NOOPT-NEXT: v_mov_b32_e32 v24, v7
+; NOOPT-NEXT: v_mov_b32_e32 v25, v6
+; NOOPT-NEXT: v_mov_b32_e32 v26, v5
+; NOOPT-NEXT: v_mov_b32_e32 v27, v4
+; NOOPT-NEXT: v_mov_b32_e32 v28, v3
+; NOOPT-NEXT: v_mov_b32_e32 v29, v2
+; NOOPT-NEXT: v_mov_b32_e32 v30, v1
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 s5, 16
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_out_of_bounds_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s2, s6
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 s0, s10
+; SI-MOVREL-NEXT: s_mov_b32 s1, s11
+; SI-MOVREL-NEXT: s_mov_b32 s3, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s12, s12, 16
+; SI-MOVREL-NEXT: s_mov_b32 m0, s12
+; SI-MOVREL-NEXT: s_mov_b32 s4, s8
+; SI-MOVREL-NEXT: s_mov_b32 s5, s9
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_out_of_bounds_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: s_add_i32 s2, s2, 16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_out_of_bounds_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5
+; VI-IDXMODE-NEXT: s_add_i32 s2, s2, 16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_out_of_bounds_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s0, s0, 16
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ld = load volatile <16 x i32>, ptr addrspace(1) %in
%offset = add i32 %idx, 16
@@ -464,17 +6983,192 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}extractelement_v16i32_or_index:
-; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-
-; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
-; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-
-; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(SRC0)
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
+; NOOPT-LABEL: extractelement_v16i32_or_index:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xb
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xd
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s7, s9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; NOOPT-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
+; NOOPT-NEXT: s_mov_b32 s9, s7
+; NOOPT-NEXT: s_mov_b32 s10, s6
+; NOOPT-NEXT: s_mov_b32 s11, s5
+; NOOPT-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v16, v15
+; NOOPT-NEXT: v_mov_b32_e32 v17, v14
+; NOOPT-NEXT: v_mov_b32_e32 v18, v13
+; NOOPT-NEXT: v_mov_b32_e32 v19, v12
+; NOOPT-NEXT: v_mov_b32_e32 v20, v11
+; NOOPT-NEXT: v_mov_b32_e32 v21, v10
+; NOOPT-NEXT: v_mov_b32_e32 v22, v9
+; NOOPT-NEXT: v_mov_b32_e32 v23, v8
+; NOOPT-NEXT: v_mov_b32_e32 v24, v7
+; NOOPT-NEXT: v_mov_b32_e32 v25, v6
+; NOOPT-NEXT: v_mov_b32_e32 v26, v5
+; NOOPT-NEXT: v_mov_b32_e32 v27, v4
+; NOOPT-NEXT: v_mov_b32_e32 v28, v3
+; NOOPT-NEXT: v_mov_b32_e32 v29, v2
+; NOOPT-NEXT: v_mov_b32_e32 v30, v1
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 s5, 2
+; NOOPT-NEXT: s_lshl_b32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extractelement_v16i32_or_index:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; SI-MOVREL-NEXT: s_load_dword s12, s[2:3], 0xd
+; SI-MOVREL-NEXT: s_mov_b32 s7, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s6, -1
+; SI-MOVREL-NEXT: s_mov_b32 s2, s6
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_mov_b32 s0, s10
+; SI-MOVREL-NEXT: s_mov_b32 s1, s11
+; SI-MOVREL-NEXT: s_mov_b32 s3, s7
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: s_lshl_b32 s0, s12, 2
+; SI-MOVREL-NEXT: s_mov_b32 m0, s0
+; SI-MOVREL-NEXT: s_mov_b32 s4, s8
+; SI-MOVREL-NEXT: s_mov_b32 s5, s9
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extractelement_v16i32_or_index:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-MOVREL-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s7
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 48
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 32
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_add_u32 s0, s6, 16
+; VI-MOVREL-NEXT: s_addc_u32 s1, s7, 0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: s_lshl_b32 s0, s2, 2
+; VI-MOVREL-NEXT: s_mov_b32 m0, s0
+; VI-MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s5
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v0, v1
+; VI-MOVREL-NEXT: flat_store_dword v[16:17], v0
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extractelement_v16i32_or_index:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-IDXMODE-NEXT: s_load_dword s2, s[2:3], 0x34
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 48
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 32
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s7
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[0:3], v[0:1] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[12:15], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: s_add_u32 s0, s6, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s1, s7, 0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[8:11], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_load_dwordx4 v[4:7], v[4:5] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s5
+; VI-IDXMODE-NEXT: s_lshl_b32 s0, s2, 2
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v1
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: flat_store_dword v[16:17], v0
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extractelement_v16i32_or_index:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: s_load_dword s0, s[2:3], 0x34
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[4:5]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%ld = load volatile <16 x i32>, ptr addrspace(1) %in
%idx.shl = shl i32 %idx.in, 2
@@ -484,17 +7178,249 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
-; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
-
-; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
-; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-
-; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], gpr_idx(DST)
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
+; NOOPT-LABEL: insertelement_v16f32_or_index:
+; NOOPT: ; %bb.0:
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0x29
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 2
+; NOOPT-NEXT: s_lshl_b32 s4, s4, s5
+; NOOPT-NEXT: v_mov_b32_e32 v0, 0x40a00000
+; NOOPT-NEXT: v_mov_b32_e32 v7, s8
+; NOOPT-NEXT: v_mov_b32_e32 v8, s9
+; NOOPT-NEXT: v_mov_b32_e32 v9, s10
+; NOOPT-NEXT: v_mov_b32_e32 v10, s11
+; NOOPT-NEXT: v_mov_b32_e32 v11, s12
+; NOOPT-NEXT: v_mov_b32_e32 v12, s13
+; NOOPT-NEXT: v_mov_b32_e32 v13, s14
+; NOOPT-NEXT: v_mov_b32_e32 v14, s15
+; NOOPT-NEXT: v_mov_b32_e32 v15, s16
+; NOOPT-NEXT: v_mov_b32_e32 v16, s17
+; NOOPT-NEXT: v_mov_b32_e32 v17, s18
+; NOOPT-NEXT: v_mov_b32_e32 v18, s19
+; NOOPT-NEXT: v_mov_b32_e32 v19, s20
+; NOOPT-NEXT: v_mov_b32_e32 v20, s21
+; NOOPT-NEXT: v_mov_b32_e32 v21, s22
+; NOOPT-NEXT: v_mov_b32_e32 v22, s23
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movreld_b32_e32 v8, v0
+; NOOPT-NEXT: v_mov_b32_e32 v4, v22
+; NOOPT-NEXT: v_mov_b32_e32 v5, v21
+; NOOPT-NEXT: v_mov_b32_e32 v6, v20
+; NOOPT-NEXT: v_mov_b32_e32 v0, v19
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; NOOPT-NEXT: v_mov_b32_e32 v4, v18
+; NOOPT-NEXT: v_mov_b32_e32 v5, v17
+; NOOPT-NEXT: v_mov_b32_e32 v6, v16
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v15
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; NOOPT-NEXT: v_mov_b32_e32 v4, v14
+; NOOPT-NEXT: v_mov_b32_e32 v5, v13
+; NOOPT-NEXT: v_mov_b32_e32 v6, v12
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v11
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; NOOPT-NEXT: v_mov_b32_e32 v4, v10
+; NOOPT-NEXT: v_mov_b32_e32 v5, v9
+; NOOPT-NEXT: v_mov_b32_e32 v6, v8
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, v7
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v6
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v4
+; NOOPT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insertelement_v16f32_or_index:
+; SI-MOVREL: ; %bb.0:
+; SI-MOVREL-NEXT: s_load_dword s0, s[2:3], 0x29
+; SI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; SI-MOVREL-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
+; SI-MOVREL-NEXT: s_mov_b32 s23, 0xf000
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_lshl_b32 s0, s0, 2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; SI-MOVREL-NEXT: s_mov_b32 m0, s0
+; SI-MOVREL-NEXT: s_mov_b32 s22, -1
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insertelement_v16f32_or_index:
+; VI-MOVREL: ; %bb.0:
+; VI-MOVREL-NEXT: s_load_dword s20, s[2:3], 0xa4
+; VI-MOVREL-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, 0x40a00000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_lshl_b32 s2, s20, 2
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s4
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 48
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s6
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, s7
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s8
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s9
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, s10
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, s11
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s12
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s13
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, s14
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, s15
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s17
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, s18
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, s19
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v1, v16
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 32
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, s2
+; VI-MOVREL-NEXT: s_add_u32 s2, s0, 16
+; VI-MOVREL-NEXT: s_addc_u32 s3, s1, 0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, s3
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-MOVREL-NEXT: s_nop 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, s1
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, s0
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insertelement_v16f32_or_index:
+; VI-IDXMODE: ; %bb.0:
+; VI-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4
+; VI-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, 0x40a00000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_lshl_b32 s3, s20, 2
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, s18
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, s19
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 48
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, v16
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 32
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, s2
+; VI-IDXMODE-NEXT: s_add_u32 s2, s0, 16
+; VI-IDXMODE-NEXT: s_addc_u32 s3, s1, 0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, s3
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, s2
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
+; VI-IDXMODE-NEXT: s_nop 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, s1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, s0
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insertelement_v16f32_or_index:
+; GFX9-IDXMODE: ; %bb.0:
+; GFX9-IDXMODE-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x64
+; GFX9-IDXMODE-NEXT: s_load_dword s20, s[2:3], 0xa4
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, 0x40a00000
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, s8
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, s13
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, s15
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, s16
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, s17
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, s18
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, s19
+; GFX9-IDXMODE-NEXT: s_lshl_b32 s2, s20, 2
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
%idx.shl = shl i32 %idx.in, 2
%idx = or i32 %idx.shl, 1
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
@@ -502,57 +7428,891 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
ret void
}
-; GCN-LABEL: {{^}}broken_phi_bb:
-; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
-
-; GCN: {{.LBB[0-9]+_[0-9]+}}:
-; GCN: [[BB2:.LBB[0-9]+_[0-9]+]]:
-; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
-; GCN: buffer_load_dword
-
-; GCN: [[REGLOOP:.LBB[0-9]+_[0-9]+]]:
-; MOVREL: v_movreld_b32_e32
-
-; IDXMODE: s_set_gpr_idx_on
-; IDXMODE: v_mov_b32_e32
-; IDXMODE: s_set_gpr_idx_off
-
-; GCN: s_cbranch_execnz [[REGLOOP]]
-
-; GCN: {{^; %bb.[0-9]}}:
-; GCN: s_mov_b64 exec,
-; GCN: s_cbranch_execnz [[BB2]]
-
-define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
+; NOOPT-LABEL: broken_phi_bb:
+; NOOPT: ; %bb.0: ; %bb
+; NOOPT-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s26, -1
+; NOOPT-NEXT: s_mov_b32 s27, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s24, s24, s9
+; NOOPT-NEXT: s_addc_u32 s25, s25, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; NOOPT-NEXT: s_load_dword s1, s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s0, s[2:3], 0xa
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 0
+; NOOPT-NEXT: s_mov_b32 s1, 8
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 1
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, 8
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: .LBB26_1: ; %bb2
+; NOOPT-NEXT: ; =>This Loop Header: Depth=1
+; NOOPT-NEXT: ; Child Loop BB26_3 Depth 2
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 0
+; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 s[0:1], -1
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_cmp_ge_i32_e64 s[2:3], v1, s2
+; NOOPT-NEXT: v_mov_b32_e32 v1, s4
+; NOOPT-NEXT: s_and_b64 vcc, exec, s[2:3]
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 2
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 3
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_cbranch_vccnz .LBB26_6
+; NOOPT-NEXT: ; %bb.2: ; %bb4
+; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v16, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3
+; NOOPT-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
+; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5
+; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; NOOPT-NEXT: s_mov_b32 s1, 0xf000
+; NOOPT-NEXT: s_mov_b32 s2, -1
+; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: s_mov_b32 s5, s3
+; NOOPT-NEXT: s_mov_b32 s6, s2
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s4
+; NOOPT-NEXT: v_mov_b32_e32 v1, s5
+; NOOPT-NEXT: v_mov_b32_e32 v2, s6
+; NOOPT-NEXT: v_mov_b32_e32 v3, s7
+; NOOPT-NEXT: v_mov_b32_e32 v4, s8
+; NOOPT-NEXT: v_mov_b32_e32 v5, s9
+; NOOPT-NEXT: v_mov_b32_e32 v6, s10
+; NOOPT-NEXT: v_mov_b32_e32 v7, s11
+; NOOPT-NEXT: v_mov_b32_e32 v8, s12
+; NOOPT-NEXT: v_mov_b32_e32 v9, s13
+; NOOPT-NEXT: v_mov_b32_e32 v10, s14
+; NOOPT-NEXT: v_mov_b32_e32 v11, s15
+; NOOPT-NEXT: v_mov_b32_e32 v12, s16
+; NOOPT-NEXT: v_mov_b32_e32 v13, s17
+; NOOPT-NEXT: v_mov_b32_e32 v14, s18
+; NOOPT-NEXT: v_mov_b32_e32 v15, s19
+; NOOPT-NEXT: v_mov_b32_e32 v17, s0
+; NOOPT-NEXT: buffer_store_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 4
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 5
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB26_3: ; Parent Loop BB26_1 Depth=1
+; NOOPT-NEXT: ; => This Inner Loop Header: Depth=2
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 6
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 7
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[24:27], 0 offset:76 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[24:27], 0 offset:80 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movreld_b32_e32 v1, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[24:27], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[24:27], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[24:27], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[24:27], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[24:27], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[24:27], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[24:27], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[24:27], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[24:27], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[24:27], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[24:27], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[24:27], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[24:27], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[24:27], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 6
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 7
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB26_3
+; NOOPT-NEXT: ; %bb.4: ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 5
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.5: ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:84 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[24:27], 0 offset:88 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[24:27], 0 offset:92 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[24:27], 0 offset:96 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[24:27], 0 offset:100 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[24:27], 0 offset:104 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[24:27], 0 offset:108 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[24:27], 0 offset:112 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[24:27], 0 offset:116 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[24:27], 0 offset:120 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[24:27], 0 offset:124 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[24:27], 0 offset:132 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[24:27], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[24:27], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[24:27], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 s[0:1], 0
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[24:27], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_writelane_b32 v0, s0, 2
+; NOOPT-NEXT: v_writelane_b32 v0, s1, 3
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: .LBB26_6: ; %Flow
+; NOOPT-NEXT: ; in Loop: Header=BB26_1 Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: s_waitcnt expcnt(1)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v1, 2
+; NOOPT-NEXT: v_readlane_b32 s1, v1, 3
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; NOOPT-NEXT: s_mov_b32 s0, 1
+; NOOPT-NEXT: ; implicit-def: $sgpr1
+; NOOPT-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0
+; NOOPT-NEXT: s_and_b64 vcc, exec, s[0:1]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_cbranch_vccnz .LBB26_1
+; NOOPT-NEXT: ; %bb.7: ; %bb8
+; NOOPT-NEXT: s_or_saveexec_b64 s[20:21], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[20:21]
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: broken_phi_bb:
+; SI-MOVREL: ; %bb.0: ; %bb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_branch .LBB26_2
+; SI-MOVREL-NEXT: .LBB26_1:
+; SI-MOVREL-NEXT: ; implicit-def: $vgpr0
+; SI-MOVREL-NEXT: s_branch .LBB26_6
+; SI-MOVREL-NEXT: .LBB26_2: ; %bb2
+; SI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
+; SI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
+; SI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
+; SI-MOVREL-NEXT: ; %bb.3: ; %bb4
+; SI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; SI-MOVREL-NEXT: buffer_load_dword v16, off, s[0:3], 0 glc
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
+; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec
+; SI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; SI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; SI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2
+; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v16
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16
+; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
+; SI-MOVREL-NEXT: s_mov_b32 m0, s6
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17
+; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc
+; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4
+; SI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5]
+; SI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2
+; SI-MOVREL-NEXT: .LBB26_6: ; %bb8
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: broken_phi_bb:
+; VI-MOVREL: ; %bb.0: ; %bb
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 8
+; VI-MOVREL-NEXT: s_branch .LBB26_2
+; VI-MOVREL-NEXT: .LBB26_1:
+; VI-MOVREL-NEXT: ; implicit-def: $vgpr0
+; VI-MOVREL-NEXT: s_branch .LBB26_6
+; VI-MOVREL-NEXT: .LBB26_2: ; %bb2
+; VI-MOVREL-NEXT: ; =>This Loop Header: Depth=1
+; VI-MOVREL-NEXT: ; Child Loop BB26_4 Depth 2
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
+; VI-MOVREL-NEXT: s_cbranch_vccnz .LBB26_1
+; VI-MOVREL-NEXT: ; %bb.3: ; %bb4
+; VI-MOVREL-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; VI-MOVREL-NEXT: flat_load_dword v16, v[0:1] glc
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, s1
+; VI-MOVREL-NEXT: s_mov_b64 s[2:3], exec
+; VI-MOVREL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; VI-MOVREL-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; VI-MOVREL-NEXT: ; => This Inner Loop Header: Depth=2
+; VI-MOVREL-NEXT: v_readfirstlane_b32 s4, v16
+; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16
+; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
+; VI-MOVREL-NEXT: s_mov_b32 m0, s4
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v0, v17
+; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc
+; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_4
+; VI-MOVREL-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; VI-MOVREL-NEXT: s_mov_b64 exec, s[2:3]
+; VI-MOVREL-NEXT: s_cbranch_execnz .LBB26_2
+; VI-MOVREL-NEXT: .LBB26_6: ; %bb8
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: broken_phi_bb:
+; VI-IDXMODE: ; %bb.0: ; %bb
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; VI-IDXMODE-NEXT: s_branch .LBB26_2
+; VI-IDXMODE-NEXT: .LBB26_1:
+; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0
+; VI-IDXMODE-NEXT: s_branch .LBB26_6
+; VI-IDXMODE-NEXT: .LBB26_2: ; %bb2
+; VI-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
+; VI-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
+; VI-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
+; VI-IDXMODE-NEXT: ; %bb.3: ; %bb4
+; VI-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; VI-IDXMODE-NEXT: flat_load_dword v16, v[0:1] glc
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
+; VI-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
+; VI-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; VI-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; VI-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2
+; VI-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16
+; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16
+; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, v17
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc
+; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4
+; VI-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; VI-IDXMODE-NEXT: s_mov_b64 exec, s[2:3]
+; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2
+; VI-IDXMODE-NEXT: .LBB26_6: ; %bb8
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: broken_phi_bb:
+; GFX9-IDXMODE: ; %bb.0: ; %bb
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-IDXMODE-NEXT: s_branch .LBB26_2
+; GFX9-IDXMODE-NEXT: .LBB26_1:
+; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0
+; GFX9-IDXMODE-NEXT: s_branch .LBB26_6
+; GFX9-IDXMODE-NEXT: .LBB26_2: ; %bb2
+; GFX9-IDXMODE-NEXT: ; =>This Loop Header: Depth=1
+; GFX9-IDXMODE-NEXT: ; Child Loop BB26_4 Depth 2
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: v_cmp_le_i32_e32 vcc, s0, v0
+; GFX9-IDXMODE-NEXT: s_cbranch_vccnz .LBB26_1
+; GFX9-IDXMODE-NEXT: ; %bb.3: ; %bb4
+; GFX9-IDXMODE-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT: global_load_dword v16, v[0:1], off glc
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, s1
+; GFX9-IDXMODE-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-IDXMODE-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX9-IDXMODE-NEXT: .LBB26_4: ; Parent Loop BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s4, v16
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16
+; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v17
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc
+; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_4
+; GFX9-IDXMODE-NEXT: ; %bb.5: ; in Loop: Header=BB26_2 Depth=1
+; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[2:3]
+; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB26_2
+; GFX9-IDXMODE-NEXT: .LBB26_6: ; %bb8
+; GFX9-IDXMODE-NEXT: s_endpgm
bb:
br label %bb2
-bb2: ; preds = %bb4, %bb
+bb2:
%tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
%tmp3 = icmp slt i32 %tmp, %arg
br i1 %tmp3, label %bb4, label %bb8
-bb4: ; preds = %bb2
+bb4:
%vgpr = load volatile i32, ptr addrspace(1) undef
%tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
%tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
%tmp7 = extractelement <16 x i32> %tmp6, i32 0
br label %bb2
-bb8: ; preds = %bb2
+bb8:
ret void
}
-; GCN-LABEL: {{^}}insert_or_disj_index:
-; GCN: v_mov_b32_e32 v[[#VIDX:]], 0
-
-; MOVREL: s_mov_b32 m0, s{{[0-9]+}}
-; MOVREL: v_movreld_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}}
-
-; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(DST)
-; IDXMODE: v_mov_b32_e32 v[[#VIDX + 1]], v{{[0-9]+}}
-; IDXMODE: s_set_gpr_idx_off
define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) {
+; NOOPT-LABEL: insert_or_disj_index:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
+; NOOPT-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
+; NOOPT-NEXT: s_mov_b32 s18, -1
+; NOOPT-NEXT: s_mov_b32 s19, 0xe8f000
+; NOOPT-NEXT: s_add_u32 s16, s16, s5
+; NOOPT-NEXT: s_addc_u32 s17, s17, 0
+; NOOPT-NEXT: ; implicit-def: $vgpr16 : SGPR spill to VGPR lane
+; NOOPT-NEXT: v_writelane_b32 v16, s4, 0
+; NOOPT-NEXT: s_mov_b32 s4, s1
+; NOOPT-NEXT: v_readlane_b32 s1, v16, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s4, 1
+; NOOPT-NEXT: s_mov_b32 s4, s0
+; NOOPT-NEXT: v_readlane_b32 s0, v16, 1
+; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:144 ; 4-byte Folded Spill
+; NOOPT-NEXT: v_mov_b32_e32 v2, v1
+; NOOPT-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: s_mov_b32 s5, s0
+; NOOPT-NEXT: s_mov_b32 s6, s2
+; NOOPT-NEXT: s_mov_b32 s7, s3
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: ; implicit-def: $sgpr0
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v2
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:136 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:140 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b32 s8, 0xf000
+; NOOPT-NEXT: s_mov_b32 s0, 0
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 2
+; NOOPT-NEXT: s_mov_b32 s2, s0
+; NOOPT-NEXT: s_mov_b32 s3, s8
+; NOOPT-NEXT: s_mov_b32 s8, s0
+; NOOPT-NEXT: s_mov_b32 s9, s0
+; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9 killed $sgpr8_sgpr9 def $sgpr8_sgpr9_sgpr10_sgpr11
+; NOOPT-NEXT: s_mov_b64 s[10:11], s[2:3]
+; NOOPT-NEXT: v_writelane_b32 v16, s8, 3
+; NOOPT-NEXT: v_writelane_b32 v16, s9, 4
+; NOOPT-NEXT: v_writelane_b32 v16, s10, 5
+; NOOPT-NEXT: v_writelane_b32 v16, s11, 6
+; NOOPT-NEXT: ; kill: def $sgpr8_sgpr9_sgpr10_sgpr11 killed $sgpr4_sgpr5_sgpr6_sgpr7
+; NOOPT-NEXT: ; implicit-def: $sgpr2_sgpr3
+; NOOPT-NEXT: s_waitcnt expcnt(1)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s1
+; NOOPT-NEXT: buffer_load_dword v0, v0, s[4:7], s0 offen
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:132 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v0, s0
+; NOOPT-NEXT: v_mov_b32_e32 v31, s0
+; NOOPT-NEXT: v_mov_b32_e32 v30, s0
+; NOOPT-NEXT: v_mov_b32_e32 v29, s0
+; NOOPT-NEXT: v_mov_b32_e32 v28, s0
+; NOOPT-NEXT: v_mov_b32_e32 v27, s0
+; NOOPT-NEXT: v_mov_b32_e32 v26, s0
+; NOOPT-NEXT: v_mov_b32_e32 v25, s0
+; NOOPT-NEXT: v_mov_b32_e32 v24, s0
+; NOOPT-NEXT: v_mov_b32_e32 v23, s0
+; NOOPT-NEXT: v_mov_b32_e32 v22, s0
+; NOOPT-NEXT: v_mov_b32_e32 v21, s0
+; NOOPT-NEXT: v_mov_b32_e32 v20, s0
+; NOOPT-NEXT: v_mov_b32_e32 v19, s0
+; NOOPT-NEXT: v_mov_b32_e32 v18, s0
+; NOOPT-NEXT: v_mov_b32_e32 v17, s0
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v31
+; NOOPT-NEXT: v_mov_b32_e32 v2, v30
+; NOOPT-NEXT: v_mov_b32_e32 v3, v29
+; NOOPT-NEXT: v_mov_b32_e32 v4, v28
+; NOOPT-NEXT: v_mov_b32_e32 v5, v27
+; NOOPT-NEXT: v_mov_b32_e32 v6, v26
+; NOOPT-NEXT: v_mov_b32_e32 v7, v25
+; NOOPT-NEXT: v_mov_b32_e32 v8, v24
+; NOOPT-NEXT: v_mov_b32_e32 v9, v23
+; NOOPT-NEXT: v_mov_b32_e32 v10, v22
+; NOOPT-NEXT: v_mov_b32_e32 v11, v21
+; NOOPT-NEXT: v_mov_b32_e32 v12, v20
+; NOOPT-NEXT: v_mov_b32_e32 v13, v19
+; NOOPT-NEXT: v_mov_b32_e32 v14, v18
+; NOOPT-NEXT: v_mov_b32_e32 v15, v17
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:68 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:72 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:76 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:80 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:84 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:88 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:92 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:96 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:100 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:104 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:108 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:112 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:116 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:120 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:124 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:128 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[0:1], exec
+; NOOPT-NEXT: v_writelane_b32 v16, s0, 7
+; NOOPT-NEXT: v_writelane_b32 v16, s1, 8
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_waitcnt expcnt(1)
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: ; implicit-def: $sgpr0_sgpr1
+; NOOPT-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 9
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 10
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v17, off, s[16:19], 0 offset:144 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:132 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readfirstlane_b32 s2, v18
+; NOOPT-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v18
+; NOOPT-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; NOOPT-NEXT: s_mov_b32 m0, s2
+; NOOPT-NEXT: v_movreld_b32_e32 v2, v17
+; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 offset:148 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:152 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:156 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:160 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:164 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:168 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:172 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:176 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:180 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:184 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:188 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:192 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:196 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:200 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:204 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:208 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_store_dword v1, off, s[16:19], 0 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: buffer_store_dword v2, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v6, off, s[16:19], 0 offset:20 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v7, off, s[16:19], 0 offset:24 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v8, off, s[16:19], 0 offset:28 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v9, off, s[16:19], 0 offset:32 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v10, off, s[16:19], 0 offset:36 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v11, off, s[16:19], 0 offset:40 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v12, off, s[16:19], 0 offset:44 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v13, off, s[16:19], 0 offset:48 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v14, off, s[16:19], 0 offset:52 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v15, off, s[16:19], 0 offset:56 ; 4-byte Folded Spill
+; NOOPT-NEXT: buffer_store_dword v16, off, s[16:19], 0 offset:60 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 s[2:3], s[0:1]
+; NOOPT-NEXT: v_writelane_b32 v0, s2, 9
+; NOOPT-NEXT: v_writelane_b32 v0, s3, 10
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Spill
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_xor_b64 exec, exec, s[0:1]
+; NOOPT-NEXT: s_cbranch_execnz .LBB27_1
+; NOOPT-NEXT: ; %bb.2:
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 7
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 8
+; NOOPT-NEXT: s_mov_b64 exec, s[0:1]
+; NOOPT-NEXT: ; %bb.3:
+; NOOPT-NEXT: s_or_saveexec_b64 s[12:13], -1
+; NOOPT-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:64 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_mov_b64 exec, s[12:13]
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_readlane_b32 s0, v0, 3
+; NOOPT-NEXT: v_readlane_b32 s1, v0, 4
+; NOOPT-NEXT: v_readlane_b32 s2, v0, 5
+; NOOPT-NEXT: v_readlane_b32 s3, v0, 6
+; NOOPT-NEXT: buffer_load_dword v5, off, s[16:19], 0 offset:136 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v6, off, s[16:19], 0 offset:140 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v18, off, s[16:19], 0 offset:148 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v19, off, s[16:19], 0 offset:152 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v20, off, s[16:19], 0 offset:156 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v21, off, s[16:19], 0 offset:160 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v22, off, s[16:19], 0 offset:164 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v23, off, s[16:19], 0 offset:168 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v24, off, s[16:19], 0 offset:172 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v25, off, s[16:19], 0 offset:176 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v26, off, s[16:19], 0 offset:180 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v27, off, s[16:19], 0 offset:184 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v28, off, s[16:19], 0 offset:188 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v29, off, s[16:19], 0 offset:192 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v30, off, s[16:19], 0 offset:196 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v31, off, s[16:19], 0 offset:200 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v32, off, s[16:19], 0 offset:204 ; 4-byte Folded Reload
+; NOOPT-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:208 ; 4-byte Folded Reload
+; NOOPT-NEXT: s_waitcnt vmcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v7, v21
+; NOOPT-NEXT: v_mov_b32_e32 v8, v20
+; NOOPT-NEXT: v_mov_b32_e32 v9, v19
+; NOOPT-NEXT: v_mov_b32_e32 v1, v18
+; NOOPT-NEXT: v_mov_b32_e32 v2, v25
+; NOOPT-NEXT: v_mov_b32_e32 v3, v24
+; NOOPT-NEXT: v_mov_b32_e32 v4, v23
+; NOOPT-NEXT: v_mov_b32_e32 v10, v22
+; NOOPT-NEXT: v_mov_b32_e32 v15, v29
+; NOOPT-NEXT: v_mov_b32_e32 v16, v28
+; NOOPT-NEXT: v_mov_b32_e32 v17, v27
+; NOOPT-NEXT: v_mov_b32_e32 v11, v26
+; NOOPT-NEXT: v_mov_b32_e32 v12, v33
+; NOOPT-NEXT: v_mov_b32_e32 v13, v32
+; NOOPT-NEXT: v_mov_b32_e32 v14, v31
+; NOOPT-NEXT: v_mov_b32_e32 v18, v30
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19_vgpr20_vgpr21 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v19, v14
+; NOOPT-NEXT: v_mov_b32_e32 v20, v13
+; NOOPT-NEXT: v_mov_b32_e32 v21, v12
+; NOOPT-NEXT: v_mov_b32_e32 v13, v6
+; NOOPT-NEXT: v_mov_b32_e32 v12, v5
+; NOOPT-NEXT: buffer_store_dwordx4 v[18:21], v[12:13], s[0:3], 0 addr64 offset:48
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12_vgpr13_vgpr14 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v12, v17
+; NOOPT-NEXT: v_mov_b32_e32 v13, v16
+; NOOPT-NEXT: v_mov_b32_e32 v14, v15
+; NOOPT-NEXT: v_mov_b32_e32 v16, v6
+; NOOPT-NEXT: v_mov_b32_e32 v15, v5
+; NOOPT-NEXT: buffer_store_dwordx4 v[11:14], v[15:16], s[0:3], 0 addr64 offset:32
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
+; NOOPT-NEXT: s_waitcnt expcnt(0)
+; NOOPT-NEXT: v_mov_b32_e32 v11, v4
+; NOOPT-NEXT: v_mov_b32_e32 v12, v3
+; NOOPT-NEXT: v_mov_b32_e32 v13, v2
+; NOOPT-NEXT: v_mov_b32_e32 v2, v5
+; NOOPT-NEXT: v_mov_b32_e32 v3, v6
+; NOOPT-NEXT: buffer_store_dwordx4 v[10:13], v[2:3], s[0:3], 0 addr64 offset:16
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; implicit-def: $sgpr4
+; NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v2, v9
+; NOOPT-NEXT: v_mov_b32_e32 v3, v8
+; NOOPT-NEXT: v_mov_b32_e32 v4, v7
+; NOOPT-NEXT: buffer_store_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64
+; NOOPT-NEXT: ; kill: killed $vgpr0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: insert_or_disj_index:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, s4
+; SI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; SI-MOVREL-NEXT: s_mov_b32 s2, 0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s0, s2
+; SI-MOVREL-NEXT: s_mov_b32 s1, s2
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v16, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v17, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v18, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v19, v5
+; SI-MOVREL-NEXT: v_mov_b32_e32 v20, v5
+; SI-MOVREL-NEXT: s_mov_b64 s[4:5], exec
+; SI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+; SI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; SI-MOVREL-NEXT: v_readfirstlane_b32 s6, v2
+; SI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2
+; SI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
+; SI-MOVREL-NEXT: s_mov_b32 m0, s6
+; SI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4
+; SI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc
+; SI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1
+; SI-MOVREL-NEXT: ; %bb.2:
+; SI-MOVREL-NEXT: s_mov_b64 exec, s[4:5]
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[17:20], v[0:1], s[0:3], 0 addr64 offset:48
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[13:16], v[0:1], s[0:3], 0 addr64 offset:32
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16
+; SI-MOVREL-NEXT: buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: insert_or_disj_index:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, s4
+; VI-MOVREL-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v16, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v17, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v18, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v19, v5
+; VI-MOVREL-NEXT: v_mov_b32_e32 v20, v5
+; VI-MOVREL-NEXT: s_mov_b64 s[0:1], exec
+; VI-MOVREL-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+; VI-MOVREL-NEXT: s_waitcnt vmcnt(0)
+; VI-MOVREL-NEXT: v_readfirstlane_b32 s2, v2
+; VI-MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
+; VI-MOVREL-NEXT: s_and_saveexec_b64 vcc, vcc
+; VI-MOVREL-NEXT: s_mov_b32 m0, s2
+; VI-MOVREL-NEXT: v_movreld_b32_e32 v6, v4
+; VI-MOVREL-NEXT: s_xor_b64 exec, exec, vcc
+; VI-MOVREL-NEXT: s_cbranch_execnz .LBB27_1
+; VI-MOVREL-NEXT: ; %bb.2:
+; VI-MOVREL-NEXT: s_mov_b64 exec, s[0:1]
+; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
+; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; VI-MOVREL-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-MOVREL-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
+; VI-MOVREL-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: insert_or_disj_index:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, s4
+; VI-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v16, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v17, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v18, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v19, v5
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v20, v5
+; VI-IDXMODE-NEXT: s_mov_b64 s[0:1], exec
+; VI-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+; VI-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; VI-IDXMODE-NEXT: v_readfirstlane_b32 s2, v2
+; VI-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
+; VI-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, v4
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc
+; VI-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1
+; VI-IDXMODE-NEXT: ; %bb.2:
+; VI-IDXMODE-NEXT: s_mov_b64 exec, s[0:1]
+; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
+; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[13:16]
+; VI-IDXMODE-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-IDXMODE-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
+; VI-IDXMODE-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: insert_or_disj_index:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-IDXMODE-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v17, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v18, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v19, v5
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v20, v5
+; GFX9-IDXMODE-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-IDXMODE-NEXT: .LBB27_1: ; =>This Inner Loop Header: Depth=1
+; GFX9-IDXMODE-NEXT: s_waitcnt vmcnt(0)
+; GFX9-IDXMODE-NEXT: v_readfirstlane_b32 s2, v2
+; GFX9-IDXMODE-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
+; GFX9-IDXMODE-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: s_xor_b64 exec, exec, vcc
+; GFX9-IDXMODE-NEXT: s_cbranch_execnz .LBB27_1
+; GFX9-IDXMODE-NEXT: ; %bb.2:
+; GFX9-IDXMODE-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[17:20], off offset:48
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[13:16], off offset:32
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[9:12], off offset:16
+; GFX9-IDXMODE-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %desc, i32 %A, i32 0, i32 0)
%off = or disjoint i32 %idx, 1
@@ -560,10 +8320,3 @@ entry:
store <16 x i32> %v, ptr addrspace(1) %out
ret void
}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-declare void @llvm.amdgcn.s.barrier() #2
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind convergent }
More information about the llvm-commits
mailing list