[llvm] [AMDGPU] Generate checks for vector indexing. NFC. (PR #105668)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 22 07:20:35 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
This allows combining some test files that were split only because
adding new RUN lines introduced too much churn in the hand-written checks.
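(For context, not part of the patch text: once the assertions are autogenerated, adding a RUN line only requires regenerating the checks rather than hand-editing them. A minimal sketch of that workflow, assuming a built `llc` is on PATH and the usual in-tree script location:)

```shell
# Regenerate the FileCheck assertions for the combined test after adding or
# changing RUN lines. The script re-runs each RUN line and rewrites the CHECK
# blocks for every prefix (NOOPT, SI-MOVREL, VI, VI-IDXMODE, GFX9-IDXMODE, ...).
python3 llvm/utils/update_llc_test_checks.py \
    llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
```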
---
Patch is 421.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105668.diff
4 Files Affected:
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll (-67)
- (removed) llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll (-63)
- (removed) llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll (-53)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+8066-313)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
index 31fa32b3475cb7..872a457a3b5c34 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll
@@ -2,70 +2,6 @@
; indexing of vectors.
-; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
-; to avoid gfx9 scheduling induced issues.
-
-
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-32: v_cndmask_b32
-
-; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
- %id.ext = zext i32 %id to i64
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
- %idx0 = load volatile i32, ptr addrspace(1) %gep
- %idx1 = add i32 %idx0, 1
- %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
- %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
- %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
- store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
- %cmp = icmp eq i32 %id, 0
- br i1 %cmp, label %bb1, label %bb2
-
-bb1:
- store volatile i32 %live.out.val, ptr addrspace(1) undef
- br label %bb2
-
-bb2:
- ret void
-}
-
-; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
-; gpr_idx mode switching sequence is expanded late for this reason.
-
-; GCN-LABEL: {{^}}insert_w_offset_multiple_in_block
-
-; GCN: s_set_gpr_idx_on
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NEXT: s_set_gpr_idx_off
-
-; GCN: s_set_gpr_idx_on
-; GCN-NEXT: v_mov_b32_e32
-; GCN-NOT: v_mov_b32_e32
-; GCN-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) #0 {
-entry:
- %add1 = add i32 %in, 1
- %ins1 = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add1
- %add2 = add i32 %in, 2
- %ins2 = insertelement <16 x float> %ins1, float 17.0, i32 %add2
- store <16 x float> %ins1, ptr addrspace(1) %out1
- %out2 = getelementptr <16 x float>, ptr addrspace(1) %out1, i32 1
- store <16 x float> %ins2, ptr addrspace(1) %out2
-
- ret void
-}
-
declare hidden void @foo()
; For functions with calls, we were not accounting for m0_lo16/m0_hi16
@@ -83,7 +19,4 @@ define amdgpu_kernel void @insertelement_with_call(ptr addrspace(1) %ptr, i32 %i
ret void
}
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-declare void @llvm.amdgcn.s.barrier() #2
-
attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
deleted file mode 100644
index 1a72140963d696..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc -O0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-
-; FIXME: Merge into indirect-addressing-si.ll
-
-; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
-; of the tied implicit use and def of the super register.
-
-; CHECK-LABEL: {{^}}insert_wo_offset:
-; CHECK: s_load_dword [[IN:s[0-9]+]]
-; CHECK: s_mov_b32 m0, [[IN]]
-; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-; CHECK: buffer_store_dwordx4
-define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
-entry:
- %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
- store <16 x float> %ins, ptr addrspace(1) %out
- ret void
-}
-
-; Make sure we don't hit use of undefined register errors when expanding an
-; extract with undef index.
-
-; CHECK-LABEL: {{^}}extract_adjacent_blocks:
-; CHECK: s_load_dword [[ARG:s[0-9]+]]
-; CHECK: s_cmp_lg_u32
-; CHECK: s_cbranch_scc1 [[BB4:.LBB[0-9]+_[0-9]+]]
-
-; CHECK: buffer_load_dwordx4
-
-; CHECK: s_branch [[ENDBB:.LBB[0-9]+_[0-9]+]]
-
-; CHECK: [[BB4]]:
-; CHECK: buffer_load_dwordx4
-
-; CHECK: [[ENDBB]]:
-; CHECK: buffer_store_dword
-; CHECK: s_endpgm
-
-define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
-bb:
- %tmp = icmp eq i32 %arg, 0
- br i1 %tmp, label %bb1, label %bb4
-
-bb1:
- %tmp2 = load volatile <4 x float>, ptr addrspace(1) undef
- %tmp3 = extractelement <4 x float> %tmp2, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp2) #0 ; Prevent block optimize out
- br label %bb7
-
-bb4:
- %tmp5 = load volatile <4 x float>, ptr addrspace(1) undef
- %tmp6 = extractelement <4 x float> %tmp5, i32 undef
- call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp5) #0 ; Prevent block optimize out
- br label %bb7
-
-bb7:
- %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
- store volatile float %tmp8, ptr addrspace(1) undef
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
deleted file mode 100644
index cbb5d9e1692843..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
-
-; Tests for indirect addressing on SI, which is implemented using dynamic
-; indexing of vectors.
-
-; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll
-; to avoid gfx9 scheduling induced issues.
-
-
-; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx16 s[[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]]
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
-; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
-
-; GCN-DAG: v_add_{{i32|u32}}_e32 [[IDX1:v[0-9]+]], vcc, 1, [[IDX0]]
-
-; GCN: v_cmp_eq_u32_e32
-; GCN-COUNT-32: v_cndmask_b32
-
-; GCN-COUNT-4: buffer_store_dwordx4
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) #0 {
-entry:
- %id = call i32 @llvm.amdgcn.workitem.id.x() #1
- %id.ext = zext i32 %id to i64
- %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %id.ext
- %idx0 = load volatile i32, ptr addrspace(1) %gep
- %idx1 = add i32 %idx0, 1
- %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
- %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
- %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
- store volatile <16 x i32> %vec2, ptr addrspace(1) %out0
- %cmp = icmp eq i32 %id, 0
- br i1 %cmp, label %bb1, label %bb2
-
-bb1:
- store volatile i32 %live.out.val, ptr addrspace(1) undef
- br label %bb2
-
-bb2:
- ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-declare void @llvm.amdgcn.s.barrier() #2
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index f095aef7a0cc81..c130eb04d02370 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,26 +1,197 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode < %s | FileCheck -check-prefixes=VI,VI-IDXMODE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-IDXMODE %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
-
-; GCN-LABEL: {{^}}extract_w_offset:
-; GCN-DAG: s_load_dword [[IN0:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
-; GCN-DAG: s_add_i32 [[IN:s[0-9]+]], [[IN0]], 1
-
-; MOVREL-DAG: s_mov_b32 m0, [[IN]]
-; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-
-; IDXMODE: s_set_gpr_idx_on [[IN]], gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
+; NOOPT-LABEL: extract_w_offset:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2, s6
+; NOOPT-NEXT: s_mov_b32 s3, s5
+; NOOPT-NEXT: s_mov_b32 s5, 1
+; NOOPT-NEXT: s_add_i32 s4, s4, s5
+; NOOPT-NEXT: s_mov_b32 s5, 0x41800000
+; NOOPT-NEXT: s_mov_b32 s6, 0x41700000
+; NOOPT-NEXT: s_mov_b32 s7, 0x41600000
+; NOOPT-NEXT: s_mov_b32 s8, 0x41500000
+; NOOPT-NEXT: s_mov_b32 s9, 0x41400000
+; NOOPT-NEXT: s_mov_b32 s10, 0x41300000
+; NOOPT-NEXT: s_mov_b32 s11, 0x41200000
+; NOOPT-NEXT: s_mov_b32 s12, 0x41100000
+; NOOPT-NEXT: s_mov_b32 s13, 0x41000000
+; NOOPT-NEXT: s_mov_b32 s14, 0x40e00000
+; NOOPT-NEXT: s_mov_b32 s15, 0x40c00000
+; NOOPT-NEXT: s_mov_b32 s16, 0x40a00000
+; NOOPT-NEXT: s_mov_b32 s17, 4.0
+; NOOPT-NEXT: s_mov_b32 s18, 0x40400000
+; NOOPT-NEXT: s_mov_b32 s19, 2.0
+; NOOPT-NEXT: s_mov_b32 s20, 1.0
+; NOOPT-NEXT: v_mov_b32_e32 v0, s20
+; NOOPT-NEXT: v_mov_b32_e32 v30, s19
+; NOOPT-NEXT: v_mov_b32_e32 v29, s18
+; NOOPT-NEXT: v_mov_b32_e32 v28, s17
+; NOOPT-NEXT: v_mov_b32_e32 v27, s16
+; NOOPT-NEXT: v_mov_b32_e32 v26, s15
+; NOOPT-NEXT: v_mov_b32_e32 v25, s14
+; NOOPT-NEXT: v_mov_b32_e32 v24, s13
+; NOOPT-NEXT: v_mov_b32_e32 v23, s12
+; NOOPT-NEXT: v_mov_b32_e32 v22, s11
+; NOOPT-NEXT: v_mov_b32_e32 v21, s10
+; NOOPT-NEXT: v_mov_b32_e32 v20, s9
+; NOOPT-NEXT: v_mov_b32_e32 v19, s8
+; NOOPT-NEXT: v_mov_b32_e32 v18, s7
+; NOOPT-NEXT: v_mov_b32_e32 v17, s6
+; NOOPT-NEXT: v_mov_b32_e32 v16, s5
+; NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; NOOPT-NEXT: v_mov_b32_e32 v1, v30
+; NOOPT-NEXT: v_mov_b32_e32 v2, v29
+; NOOPT-NEXT: v_mov_b32_e32 v3, v28
+; NOOPT-NEXT: v_mov_b32_e32 v4, v27
+; NOOPT-NEXT: v_mov_b32_e32 v5, v26
+; NOOPT-NEXT: v_mov_b32_e32 v6, v25
+; NOOPT-NEXT: v_mov_b32_e32 v7, v24
+; NOOPT-NEXT: v_mov_b32_e32 v8, v23
+; NOOPT-NEXT: v_mov_b32_e32 v9, v22
+; NOOPT-NEXT: v_mov_b32_e32 v10, v21
+; NOOPT-NEXT: v_mov_b32_e32 v11, v20
+; NOOPT-NEXT: v_mov_b32_e32 v12, v19
+; NOOPT-NEXT: v_mov_b32_e32 v13, v18
+; NOOPT-NEXT: v_mov_b32_e32 v14, v17
+; NOOPT-NEXT: v_mov_b32_e32 v15, v16
+; NOOPT-NEXT: s_mov_b32 m0, s4
+; NOOPT-NEXT: v_movrels_b32_e32 v0, v0
+; NOOPT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; NOOPT-NEXT: s_endpgm
+;
+; SI-MOVREL-LABEL: extract_w_offset:
+; SI-MOVREL: ; %bb.0: ; %entry
+; SI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0xb
+; SI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; SI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; SI-MOVREL-NEXT: s_mov_b32 s3, 0xf000
+; SI-MOVREL-NEXT: s_mov_b32 s2, -1
+; SI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; SI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; SI-MOVREL-NEXT: s_mov_b32 m0, s4
+; SI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; SI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; SI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; SI-MOVREL-NEXT: v_movrels_b32_e32 v0, v0
+; SI-MOVREL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-MOVREL-NEXT: s_endpgm
+;
+; VI-MOVREL-LABEL: extract_w_offset:
+; VI-MOVREL: ; %bb.0: ; %entry
+; VI-MOVREL-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-MOVREL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-MOVREL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-MOVREL-NEXT: s_add_i32 s4, s4, 1
+; VI-MOVREL-NEXT: s_mov_b32 m0, s4
+; VI-MOVREL-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-MOVREL-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-MOVREL-NEXT: v_movrels_b32_e32 v2, v0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; VI-MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; VI-MOVREL-NEXT: flat_store_dword v[0:1], v2
+; VI-MOVREL-NEXT: s_endpgm
+;
+; VI-IDXMODE-LABEL: extract_w_offset:
+; VI-IDXMODE: ; %bb.0: ; %entry
+; VI-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; VI-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; VI-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; VI-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; VI-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v2, v0
+; VI-IDXMODE-NEXT: s_set_gpr_idx_off
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v0, s0
+; VI-IDXMODE-NEXT: v_mov_b32_e32 v1, s1
+; VI-IDXMODE-NEXT: flat_store_dword v[0:1], v2
+; VI-IDXMODE-NEXT: s_endpgm
+;
+; GFX9-IDXMODE-LABEL: extract_w_offset:
+; GFX9-IDXMODE: ; %bb.0: ; %entry
+; GFX9-IDXMODE-NEXT: s_load_dword s4, s[2:3], 0x2c
+; GFX9-IDXMODE-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v16, 0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-IDXMODE-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-IDXMODE-NEXT: s_add_i32 s4, s4, 1
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v3, 4.0
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v4, 0x40a00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v5, 0x40c00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v6, 0x40e00000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v7, 0x41000000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v8, 0x41100000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v9, 0x41200000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v10, 0x41300000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v12, 0x41500000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v13, 0x41600000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v14, 0x41700000
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX9-IDXMODE-NEXT: v_mov_b32_e32 v0, v0
+; GFX9-IDXMODE-NEXT: s_set_gpr_idx_off
+; GFX9-IDXMODE-NEXT: global_store_dword v16, v0, s[0:1]
+; GFX9-IDXMODE-NEXT: s_endpgm
entry:
%idx = add i32 %in, 1
%elt = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %idx
@@ -29,24 +200,291 @@ entry:
}
; XXX: Could do v_or_b32 directly
-; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; GCN-DAG: s_or_b32
-; MOVREL: s_mov_b32 m0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-
-
-; MOVREL: v_movrels_b32_e32
-
-; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, gpr_idx(SRC0){{$}}
-; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
-; IDXMODE-NEXT: s_set_gpr_idx_off
define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
+; NOOPT-LABEL: extract_w_offset_salu_use_vector:
+; NOOPT: ; %bb.0: ; %entry
+; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; NOOPT-NEXT: s_load_dword s4, s[2:3], 0xb
+; NOOPT-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x19
+; NOOPT-NEXT: s_waitcnt lgkmcnt(0)
+; NOOPT-NEXT: s_mov_b32 s7, s1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; NOOPT-NEXT: s_mov_b32 s5, 0xf000
+; NOOPT-NEXT: s_mov_b32 s6, -1
+; NOOPT-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
+; NOOPT-NEXT: s_mov_b32 s1, s7
+; NOOPT-NEXT: s_mov_b32 s2,...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/105668
More information about the llvm-commits mailing list