[llvm] 5fd319f - [AMDGPU] load-local-i16.ll - regenerate test checks

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 11 03:33:45 PDT 2025


Author: Simon Pilgrim
Date: 2025-07-11T11:33:33+01:00
New Revision: 5fd319f5cc9b2439a3731c9e08c60bbf69b2ecc7

URL: https://github.com/llvm/llvm-project/commit/5fd319f5cc9b2439a3731c9e08c60bbf69b2ecc7
DIFF: https://github.com/llvm/llvm-project/commit/5fd319f5cc9b2439a3731c9e08c60bbf69b2ecc7.diff

LOG: [AMDGPU] load-local-i16.ll - regenerate test checks

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/load-local-i16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 1dd08c561b2ab..8b7102582c2d0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,246 +1,872 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
-; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s
+; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s
 
 ; Testing for ds_read/write_b128
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-DS128 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-DS128 %s
 
-; FUNC-LABEL: {{^}}local_load_i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16 v{{[0-9]+}}
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b16 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_load_i16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b16 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_load_i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b16 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 4, @0, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_SHORT_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
 entry:
   %ld = load i16, ptr addrspace(3) %in
   store i16 %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_load_v2i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v2i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_load_v2i16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b32 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b32 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_load_v2i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_v2i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 4, @1, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
 entry:
   %ld = load <2 x i16>, ptr addrspace(3) %in
   store <2 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_load_v3i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_write_b16
-
-; EG-DAG: LDS_USHORT_READ_RET
-; EG-DAG: LDS_USHORT_READ_RET
 define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v3i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v2, v0
+; SI-NEXT:    ds_write_b16 v2, v1 offset:4
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_load_v3i16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b16 v2, v1 offset:4
+; VI-NEXT:    ds_write_b32 v2, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_load_v3i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b16 v2, v1 offset:4
+; GFX9-NEXT:    ds_write_b32 v2, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_v3i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 11, @2, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.Y,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_SHORT_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
 entry:
   %ld = load <3 x i16>, ptr addrspace(3) %in
   store <3 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_load_v4i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v4i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_load_v4i16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_load_v4i16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_v4i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 11, @3, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
 entry:
   %ld = load <4 x i16>, ptr addrspace(3) %in
   store <4 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_load_v8i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v8i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_load_v8i16:
+; VI-NO-DS128:       ; %bb.0: ; %entry
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_load_v8i16:
+; GFX9-NO-DS128:       ; %bb.0: ; %entry
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_v8i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 25, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_load_v8i16:
+; VI-DS128:       ; %bb.0: ; %entry
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_load_v8i16:
+; GFX9-DS128:       ; %bb.0: ; %entry
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
 entry:
   %ld = load <8 x i16>, ptr addrspace(3) %in
   store <8 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_load_v16i16:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_load_v16i16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v8, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_load_v16i16:
+; VI-NO-DS128:       ; %bb.0: ; %entry
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_load_v16i16:
+; GFX9-NO-DS128:       ; %bb.0: ; %entry
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_load_v16i16:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 53, @5, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_load_v16i16:
+; VI-DS128:       ; %bb.0: ; %entry
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v4 offset:16
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v4
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    ds_write_b128 v8, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_load_v16i16:
+; GFX9-DS128:       ; %bb.0: ; %entry
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v4 offset:16
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v4
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
 entry:
   %ld = load <16 x i16>, ptr addrspace(3) %in
   store <16 x i16> %ld, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16
-; GCN: ds_write_b32
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_i16_to_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_zextload_i16_to_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b32 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_zextload_i16_to_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_i16_to_i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 4, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
   %a = load i16, ptr addrspace(3) %in
   %ext = zext i16 %a to i32
   store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
-; GCN-NOT: s_wqm_b64
-
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_i16
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_i16_to_i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_sextload_i16_to_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_i16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b32 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_sextload_i16_to_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_i16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_i16_to_i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 6, @7, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.X, OQAP,
+; EG-NEXT:     BFE_INT T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %a = load i16, ptr addrspace(3) %in
   %ext = sext i16 %a to i32
   store i32 %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_u16
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v1i16_to_v1i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_zextload_v1i16_to_v1i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b32 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_zextload_v1i16_to_v1i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v1i16_to_v1i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 4, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
   %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_i16
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v1i16_to_v1i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b32 v1, v0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_sextload_v1i16_to_v1i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_i16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    ds_write_b32 v1, v0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_sextload_v1i16_to_v1i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_i16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v1, v0
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v1i16_to_v1i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 6, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.X, OQAP,
+; EG-NEXT:     BFE_INT T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v2i16_to_v2i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_zextload_v2i16_to_v2i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b32 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_zextload_v2i16_to_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v2i16_to_v2i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 10, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.Y, OQAP,
+; EG-NEXT:     AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b32
-
-; EG: LDS_READ_RET
-; EG: BFE_INT
-; EG: BFE_INT
 define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v2i16_to_v2i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_sextload_v2i16_to_v2i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b32 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_sextload_v2i16_to_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v2i16_to_v2i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.Y, OQAP,
+; EG-NEXT:     LSHR * T0.W, PV.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; SI-DAG: ds_write_b32
-; SI-DAG: ds_write_b64
-; CIVI-DAG: ds_write_b96
-; GFX9-DAG: ds_write_b96
-
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
 define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_local_zextload_v3i16_to_v3i32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; SI-NEXT:    ds_write_b32 v4, v0 offset:8
+; SI-NEXT:    ds_write_b64 v4, v[2:3]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_local_zextload_v3i16_to_v3i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    ds_write_b96 v3, v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_local_zextload_v3i16_to_v3i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 18, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.Z,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.Y,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
 entry:
   %ld = load <3 x i16>, ptr addrspace(3) %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -248,23 +874,79 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-; SI-DAG: ds_write_b32
-; SI-DAG: ds_write_b64
-; CIVI-DAG: ds_write_b96
-; GFX9-DAG: ds_write_b96
-
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG: LDS_USHORT_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
 define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_local_sextload_v3i16_to_v3i32:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
+; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; SI-NEXT:    ds_write_b32 v4, v0 offset:8
+; SI-NEXT:    ds_write_b64 v4, v[2:3]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_local_sextload_v3i16_to_v3i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_b64 v[3:4], v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
+; VI-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; VI-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    ds_write_b96 v3, v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_b64 v[3:4], v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v3
+; GFX9-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX9-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_local_sextload_v3i16_to_v3i32:
+; EG:       ; %bb.0: ; %entry
+; EG-NEXT:    ALU 22, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
 entry:
   %ld = load <3 x i16>, ptr addrspace(3) %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -272,659 +954,7978 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_local_zextload_v4i16_to_v4i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_local_zextload_v4i16_to_v4i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 22, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
-; GCN-NOT: s_wqm_b64
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
 define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v4i16_to_v4i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 16, v0
+; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v3, v0, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v1
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v1
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v4i16_to_v4i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 25, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     LSHR * T0.W, T0.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LSHR T1.Z, PV.Z, literal.x,
+; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v4i16_to_v4i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b64 v[4:5], v0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
+; VI-DS128-NEXT:    v_bfe_i32 v2, v5, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v4, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b64 v[4:5], v0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v5
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v4
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v5, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v4, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v8i16_to_v8i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v12, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; SI-NEXT:    ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v2, v[0:1], v[8:9] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v2, v[4:5], v[6:7] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v10, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v10, v[4:5], v[6:7] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v8i16_to_v8i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 46, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     AND_INT T1.W, T0.W, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v8i16_to_v8i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; VI-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; VI-DS128-NEXT:    ds_write_b128 v0, v[8:11] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v0, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v12, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX9-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[8:11] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
 define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v8i16_to_v8i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v12, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; SI-NEXT:    ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v0, v[4:5], v[6:7] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v12, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v8i16_to_v8i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 51, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     LSHR * T1.W, T0.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     LSHR T1.Z, T0.W, literal.x,
+; EG-NEXT:     BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T2.Z, T0.Y, literal.x,
+; EG-NEXT:     BFE_INT T1.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.Z, T1.Y, literal.x,
+; EG-NEXT:     BFE_INT T1.W, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T1.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v8i16_to_v8i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; VI-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; VI-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v0, v[8:11] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v0, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v12, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[8:11] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-; GCN: ds_write2_b64
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v16i16_to_v16i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v7
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v6
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[0:1], v[8:9] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[0:1], v[8:9] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v16i16_to_v16i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 94, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     MOV * T2.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT:     MOV * T3.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v16i16_to_v16i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v4
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v4 offset:16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
+; VI-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; VI-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; VI-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; VI-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; VI-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v4, v[12:15] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v4, v[8:11]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v4
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v4 offset:16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
+; GFX9-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GFX9-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX9-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX9-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v5
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[12:15] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[8:11]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
 define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v16i16_to_v16i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v4
+; SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v6
+; SI-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v18, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v0, v7, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v6, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v1
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v3
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v2
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v4
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v12, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v7
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v6
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v7, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v6, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v1
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v2
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v7
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v6
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v5
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v16i16_to_v16i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 95, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     MOV * T2.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     LSHR * T3.Z, T2.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T2.W, T2.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T4.Z, T0.Y, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T3.Z, T0.Z, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T4.Z, T0.W, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T3.Z, T1.Y, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T4.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T4.Z, T1.Z, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T3.Z, T2.Z, literal.x,
+; EG-NEXT:     BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     BFE_INT T2.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     BFE_INT T1.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T1.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T1.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    ALU 7, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v16i16_to_v16i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v4
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v4 offset:16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v4
+; VI-DS128-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v8, v0, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v14, v3, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; VI-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v7, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v5
+; VI-DS128-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v4, v[12:15] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v4, v[8:11]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v4
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v4 offset:16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v1
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v3
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v2
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v4
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v0, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v3, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v7, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v5
+; GFX9-DS128-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[12:15] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[8:11]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v32i16_to_v32i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v12, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v9
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v13
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v15
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_mov_b32_e32 v32, s0
+; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v6
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v5
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v4
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v7
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v6
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v5
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v4
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v5
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v32i16_to_v32i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 105, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     MOV * T4.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     LSHR T5.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT:     MOV * T5.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:    ALU 84, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v32i16_to_v32i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v20, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v20
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v20 offset:16
+; VI-DS128-NEXT:    ds_read_b128 v[16:19], v20 offset:32
+; VI-DS128-NEXT:    ds_read_b128 v[20:23], v20 offset:48
+; VI-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(3)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; VI-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v23
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v22
+; VI-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v22
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; VI-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v21
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; VI-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; VI-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; VI-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; VI-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v5
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; VI-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v18
+; VI-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; VI-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v17
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; VI-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; VI-DS128-NEXT:    ds_write_b128 v32, v[20:23] offset:96
+; VI-DS128-NEXT:    ds_write_b128 v32, v[28:31] offset:112
+; VI-DS128-NEXT:    ds_write_b128 v32, v[16:19] offset:64
+; VI-DS128-NEXT:    ds_write_b128 v32, v[24:27] offset:80
+; VI-DS128-NEXT:    ds_write_b128 v32, v[4:7] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v32, v[12:15] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v32, v[0:3]
+; VI-DS128-NEXT:    ds_write_b128 v32, v[8:11] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v20, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v20
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v20 offset:16
+; GFX9-DS128-NEXT:    ds_read_b128 v[16:19], v20 offset:32
+; GFX9-DS128-NEXT:    ds_read_b128 v[20:23], v20 offset:48
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX9-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v23
+; GFX9-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v23
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v22
+; GFX9-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v22
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
+; GFX9-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v21
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; GFX9-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX9-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
+; GFX9-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v5
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v18
+; GFX9-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GFX9-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v17
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[20:23] offset:96
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[28:31] offset:112
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[16:19] offset:64
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[24:27] offset:80
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[4:7] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[12:15] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[0:3]
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[8:11] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v32i16_to_v32i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v12, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v2
+; SI-NEXT:    v_bfe_i32 v16, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v18, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v20, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v22, v2, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(2)
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v4
+; SI-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v7
+; SI-NEXT:    v_bfe_i32 v4, v7, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v6
+; SI-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v9
+; SI-NEXT:    v_bfe_i32 v24, v9, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v8
+; SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v11
+; SI-NEXT:    v_bfe_i32 v26, v11, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v10
+; SI-NEXT:    v_bfe_i32 v10, v10, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v13
+; SI-NEXT:    v_bfe_i32 v28, v13, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v12
+; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v15
+; SI-NEXT:    v_bfe_i32 v30, v15, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v14
+; SI-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v32, s0
+; SI-NEXT:    ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v3
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v2
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v1
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v7
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v6
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v5
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v7, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v6, 0, 16
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_bfe_i32 v20, v5, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v4
+; VI-NO-DS128-NEXT:    v_bfe_i32 v22, v4, 0, 16
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v1
+; VI-NO-DS128-NEXT:    v_bfe_i32 v24, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v31, 16, v5
+; VI-NO-DS128-NEXT:    v_bfe_i32 v30, v5, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v4
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v3
+; VI-NO-DS128-NEXT:    v_bfe_i32 v26, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v7
+; VI-NO-DS128-NEXT:    v_bfe_i32 v28, v7, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v6
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v2
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v1
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v7
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v6
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v5
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v20, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v22, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v1
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v24, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v31, 16, v5
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v30, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v3
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v26, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v2
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v7
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v28, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v6
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v32i16_to_v32i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 101, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     LSHR * T5.W, T4.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     LSHR T5.Z, T4.W, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T0.Y, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T0.Z, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T0.W, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T1.Y, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T1.Z, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T1.W, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR * T6.Z, T2.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 89, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T2.Z, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T2.W, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T3.Y, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T3.Z, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T3.W, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T6.Z, T4.Y, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.Z, T5.Y, literal.x,
+; EG-NEXT:     BFE_INT T5.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     BFE_INT T5.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     BFE_INT T5.W, T4.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     BFE_INT T4.W, T4.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     BFE_INT T4.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     BFE_INT T4.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     MOV * T5.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT:    ALU 16, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T4.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v32i16_to_v32i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v24
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v24 offset:16
+; VI-DS128-NEXT:    ds_read_b128 v[20:23], v24 offset:32
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v1
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v0
+; VI-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v14, v1, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v12, v0, 0, 16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v5
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v4
+; VI-DS128-NEXT:    v_bfe_i32 v2, v7, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v24 offset:48
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v30, 16, v21
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v28, 16, v20
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v38, 16, v5
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v4
+; VI-DS128-NEXT:    v_bfe_i32 v37, v5, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v35, v4, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    v_bfe_i32 v25, v23, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v23, v22, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v29, v21, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v27, v20, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v7
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v32, 16, v6
+; VI-DS128-NEXT:    v_bfe_i32 v33, v7, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v31, v6, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v4, v[35:38] offset:96
+; VI-DS128-NEXT:    ds_write_b128 v4, v[31:34] offset:112
+; VI-DS128-NEXT:    ds_write_b128 v4, v[27:30] offset:64
+; VI-DS128-NEXT:    ds_write_b128 v4, v[23:26] offset:80
+; VI-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v4, v[12:15]
+; VI-DS128-NEXT:    ds_write_b128 v4, v[8:11] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v24, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v24
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v24 offset:16
+; GFX9-DS128-NEXT:    ds_read_b128 v[20:23], v24 offset:32
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v3
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v1
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v0
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v3, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v2, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v12, v0, 0, 16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v7
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v6
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v5
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v4
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v7, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v6, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v24 offset:48
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v26, 16, v23
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v24, 16, v22
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v30, 16, v21
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v28, 16, v20
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v38, 16, v5
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v4
+; GFX9-DS128-NEXT:    v_bfe_i32 v37, v5, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v35, v4, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    v_bfe_i32 v25, v23, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v23, v22, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v29, v21, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v27, v20, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v7
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v32, 16, v6
+; GFX9-DS128-NEXT:    v_bfe_i32 v33, v7, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v31, v6, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[35:38] offset:96
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[31:34] offset:112
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[27:30] offset:64
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[23:26] offset:80
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[16:19] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[12:15]
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[8:11] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v64i16_to_v64i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_mov_b32 s15, 0xe8f000
+; SI-NEXT:    s_add_u32 s12, s12, s11
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v24, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v24 offset0:8 offset1:9
+; SI-NEXT:    ds_read2_b64 v[4:7], v24 offset0:10 offset1:11
+; SI-NEXT:    ds_read2_b64 v[12:15], v24 offset0:12 offset1:13
+; SI-NEXT:    ds_read2_b64 v[8:11], v24 offset0:14 offset1:15
+; SI-NEXT:    ds_read2_b64 v[20:23], v24 offset1:1
+; SI-NEXT:    ds_read2_b64 v[16:19], v24 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[34:37], v24 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[38:41], v24 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(7)
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v2
+; SI-NEXT:    s_waitcnt lgkmcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v5
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v1
+; SI-NEXT:    buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v3
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; SI-NEXT:    s_waitcnt lgkmcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v13
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v15
+; SI-NEXT:    v_and_b32_e32 v42, 0xffff, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v9
+; SI-NEXT:    v_and_b32_e32 v44, 0xffff, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v11
+; SI-NEXT:    v_and_b32_e32 v46, 0xffff, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
+; SI-NEXT:    v_and_b32_e32 v48, 0xffff, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v23
+; SI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    s_waitcnt lgkmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v17
+; SI-NEXT:    v_and_b32_e32 v52, 0xffff, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v19
+; SI-NEXT:    v_and_b32_e32 v54, 0xffff, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v35
+; SI-NEXT:    v_and_b32_e32 v56, 0xffff, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v34
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v37
+; SI-NEXT:    v_and_b32_e32 v58, 0xffff, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v36
+; SI-NEXT:    v_and_b32_e32 v36, 0xffff, v36
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v39
+; SI-NEXT:    v_and_b32_e32 v60, 0xffff, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v38
+; SI-NEXT:    v_and_b32_e32 v38, 0xffff, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v41
+; SI-NEXT:    v_and_b32_e32 v62, 0xffff, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v40
+; SI-NEXT:    v_and_b32_e32 v40, 0xffff, v40
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v0, v[20:21], v[48:49] offset1:1
+; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31
+; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27
+; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23
+; SI-NEXT:    ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19
+; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NO-DS128-NEXT:    s_mov_b32 s90, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v16, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v16 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-NO-DS128-NEXT:    s_add_u32 s88, s88, s11
+; VI-NO-DS128-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v11
+; VI-NO-DS128-NEXT:    buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v10
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v13
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v12
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v18
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v17
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v20
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v19
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[17:20], v16 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v26, 16, v22
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v25, 0xffff, v22
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v28, 16, v21
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v27, 0xffff, v21
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v30, 16, v24
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v31, 0xffff, v23
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v34, 16, v18
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v33, 0xffff, v18
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v35, 0xffff, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v38, 16, v20
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[21:24], v16 offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v37, 0xffff, v20
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v40, 16, v19
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v39, 0xffff, v19
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[17:20], v16 offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v42, 16, v22
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v41, 0xffff, v22
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v43, 0xffff, v21
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v45, 0xffff, v24
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v48, 16, v23
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v47, 0xffff, v23
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v18
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v49, 0xffff, v18
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v52, 16, v17
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v51, 0xffff, v17
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[21:24], v16 offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v56, 16, v19
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v55, 0xffff, v19
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v54, 16, v20
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v53, 0xffff, v20
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v58, 16, v22
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v57, 0xffff, v22
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v19
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v18
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v18
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v18, s0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v60, 16, v24
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v59, 0xffff, v24
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v24, 16, v23
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v62, 16, v17
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v61, 0xffff, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[35:36], v[33:34] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v18, v[2:3], v[0:1] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s14, -1
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s15, 0xe00000
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v56, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v56 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v11
+; GFX9-NO-DS128-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT:    s_nop 0
+; GFX9-NO-DS128-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v10
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v13
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v12
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v17
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v15
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v14
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v17
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v16
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v16
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v19
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v18
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v18
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v31, 16, v21
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v21
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v32, 0xffff, v20
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v35, 16, v23
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v34, 0xffff, v23
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v37, 16, v22
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v36, 0xffff, v22
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v39, 16, v25
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v38, 0xffff, v25
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v41, 16, v17
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v40, 0xffff, v17
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v43, 16, v16
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v42, 0xffff, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v45, 16, v19
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v44, 0xffff, v19
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v47, 16, v18
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v46, 0xffff, v18
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v49, 16, v21
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v48, 0xffff, v21
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v50, 0xffff, v20
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v53, 16, v23
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[16:19], v56 offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v52, 0xffff, v23
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v55, 16, v22
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v54, 0xffff, v22
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v57, 16, v17
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v56, 0xffff, v17
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v63, 16, v23
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v62, 0xffff, v23
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v59, 16, v19
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v58, 0xffff, v19
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v61, 16, v21
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v60, 0xffff, v21
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v20
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT:    buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v64i16_to_v64i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 116, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    120(1.681558e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    104(1.457350e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Z, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.W, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.Y, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.Z, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.W, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.Y, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.Z, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.W, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.Y, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.Z, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.W, OQAP,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.Y, OQAP,
+; EG-NEXT:     MOV * T9.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.Z, OQAP,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU 95, @27, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.W, OQAP,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T10.W
+; EG-NEXT:     MOV T10.Y, OQAP,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T10.W
+; EG-NEXT:     MOV T10.Z, OQAP,
+; EG-NEXT:     LSHR T10.W, T10.Y, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     AND_INT T10.W, T10.Y, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T10.W, T10.Z, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     AND_INT T10.W, T10.Z, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T10.W, T9.W, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     AND_INT T9.W, T9.W, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     LSHR T9.W, T9.Z, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     AND_INT T9.W, T9.Z, literal.x,
+; EG-NEXT:     MOV * T10.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     LSHR T9.W, T9.Y, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     AND_INT T9.W, T9.Y, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     LSHR T9.W, T8.W, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     AND_INT T8.W, T8.W, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     LSHR T8.W, T8.Z, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     AND_INT T8.W, T8.Z, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     LSHR T8.W, T8.Y, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     AND_INT T8.W, T8.Y, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     LSHR T8.W, T7.W, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T9.W, T8.W,
+; EG-NEXT:     AND_INT T7.W, T7.W, literal.x,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     LSHR T7.W, T7.Z, literal.x,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     AND_INT T7.W, T7.Z, literal.x,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     LSHR T7.W, T7.Y, literal.x,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     AND_INT * T7.W, T7.Y, literal.x,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:    ALU 93, @28, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.x,
+; EG-NEXT:    72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     LSHR T7.W, T6.W, literal.x,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT:     LDS_WRITE * T8.W, T7.W,
+; EG-NEXT:     AND_INT T6.W, T6.W, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     LSHR T6.W, T6.Z, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     AND_INT T6.W, T6.Z, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     LSHR T6.W, T6.Y, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     AND_INT T6.W, T6.Y, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     LSHR T6.W, T5.W, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     AND_INT T5.W, T5.W, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.W, T5.Z, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     AND_INT T5.W, T5.Z, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 156(2.186026e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     AND_INT T5.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 152(2.129974e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     LSHR T5.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 148(2.073922e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 140(1.961818e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 136(1.905766e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 132(1.849714e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 188(2.634441e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 184(2.578389e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 180(2.522337e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 172(2.410233e-43)
+; EG-NEXT:    ALU 76, @29, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 168(2.354181e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 164(2.298129e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 220(3.082857e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 216(3.026805e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 212(2.970753e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 204(2.858649e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 200(2.802597e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 196(2.746545e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 252(3.531272e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 248(3.475220e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 244(3.419168e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 236(3.307064e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 232(3.251012e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 228(3.194960e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 224(3.138909e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v64i16_to_v64i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-DS128-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-DS128-NEXT:    s_mov_b32 s90, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[8:11], v0
+; VI-DS128-NEXT:    ds_read_b128 v[16:19], v0 offset:16
+; VI-DS128-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-DS128-NEXT:    s_add_u32 s88, s88, s11
+; VI-DS128-NEXT:    s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT:    ds_read_b128 v[20:23], v0 offset:32
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; VI-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v19
+; VI-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; VI-DS128-NEXT:    buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
+; VI-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v17
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v16
+; VI-DS128-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:48
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v23
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v22
+; VI-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v23
+; VI-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; VI-DS128-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v21
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; VI-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v21
+; VI-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v20
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v27
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v26
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; VI-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v27
+; VI-DS128-NEXT:    ds_read_b128 v[36:39], v0 offset:64
+; VI-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v26
+; VI-DS128-NEXT:    v_and_b32_e32 v34, 0xffff, v25
+; VI-DS128-NEXT:    v_and_b32_e32 v32, 0xffff, v24
+; VI-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:80
+; VI-DS128-NEXT:    ds_read_b128 v[55:58], v0 offset:96
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
+; VI-DS128-NEXT:    v_mov_b32_e32 v31, v15
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v48, 16, v26
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v54, 16, v25
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v52, 16, v24
+; VI-DS128-NEXT:    v_and_b32_e32 v49, 0xffff, v27
+; VI-DS128-NEXT:    v_and_b32_e32 v47, 0xffff, v26
+; VI-DS128-NEXT:    v_and_b32_e32 v53, 0xffff, v25
+; VI-DS128-NEXT:    v_and_b32_e32 v51, 0xffff, v24
+; VI-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:112
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
+; VI-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v11
+; VI-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v10
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
+; VI-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v25
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v24
+; VI-DS128-NEXT:    v_mov_b32_e32 v24, s0
+; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; VI-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v8
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v42, 16, v39
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v40, 16, v38
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v46, 16, v37
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v44, 16, v36
+; VI-DS128-NEXT:    v_and_b32_e32 v41, 0xffff, v39
+; VI-DS128-NEXT:    v_and_b32_e32 v39, 0xffff, v38
+; VI-DS128-NEXT:    v_and_b32_e32 v45, 0xffff, v37
+; VI-DS128-NEXT:    v_and_b32_e32 v43, 0xffff, v36
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v59, 16, v57
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v56
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; VI-DS128-NEXT:    v_and_b32_e32 v60, 0xffff, v58
+; VI-DS128-NEXT:    v_and_b32_e32 v58, 0xffff, v57
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v56
+; VI-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v55
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
+; VI-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v27
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v26
+; VI-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:224
+; VI-DS128-NEXT:    ds_write_b128 v24, v[4:7] offset:240
+; VI-DS128-NEXT:    ds_write_b128 v24, v[8:11] offset:192
+; VI-DS128-NEXT:    ds_write_b128 v24, v[58:61] offset:208
+; VI-DS128-NEXT:    ds_write_b128 v24, v[51:54] offset:160
+; VI-DS128-NEXT:    ds_write_b128 v24, v[47:50] offset:176
+; VI-DS128-NEXT:    ds_write_b128 v24, v[43:46] offset:128
+; VI-DS128-NEXT:    ds_write_b128 v24, v[39:42] offset:144
+; VI-DS128-NEXT:    ds_write_b128 v24, v[32:35] offset:96
+; VI-DS128-NEXT:    ds_write_b128 v24, v[20:23] offset:112
+; VI-DS128-NEXT:    ds_write_b128 v24, v[16:19] offset:64
+; VI-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:80
+; VI-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:32
+; VI-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v24, v[12:15]
+; VI-DS128-NEXT:    ds_write_b128 v24, v[28:31] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-DS128-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-DS128-NEXT:    s_mov_b32 s14, -1
+; GFX9-DS128-NEXT:    s_mov_b32 s15, 0xe00000
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[8:11], v0
+; GFX9-DS128-NEXT:    ds_read_b128 v[16:19], v0 offset:16
+; GFX9-DS128-NEXT:    s_add_u32 s12, s12, s11
+; GFX9-DS128-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT:    ds_read_b128 v[20:23], v0 offset:32
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v19
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX9-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v19
+; GFX9-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; GFX9-DS128-NEXT:    buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    s_nop 0
+; GFX9-DS128-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v17
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v16
+; GFX9-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v17
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v16
+; GFX9-DS128-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    s_nop 0
+; GFX9-DS128-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:48
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v23
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v22
+; GFX9-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v23
+; GFX9-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; GFX9-DS128-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    s_nop 0
+; GFX9-DS128-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v21
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v20
+; GFX9-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v21
+; GFX9-DS128-NEXT:    v_and_b32_e32 v16, 0xffff, v20
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v27
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v21, 16, v26
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v35, 16, v25
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-DS128-NEXT:    v_and_b32_e32 v22, 0xffff, v27
+; GFX9-DS128-NEXT:    ds_read_b128 v[36:39], v0 offset:64
+; GFX9-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v26
+; GFX9-DS128-NEXT:    v_and_b32_e32 v34, 0xffff, v25
+; GFX9-DS128-NEXT:    v_and_b32_e32 v32, 0xffff, v24
+; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:80
+; GFX9-DS128-NEXT:    ds_read_b128 v[55:58], v0 offset:96
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v29, 16, v10
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v31, v15
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v9
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v27
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v48, 16, v26
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v54, 16, v25
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v52, 16, v24
+; GFX9-DS128-NEXT:    v_and_b32_e32 v49, 0xffff, v27
+; GFX9-DS128-NEXT:    v_and_b32_e32 v47, 0xffff, v26
+; GFX9-DS128-NEXT:    v_and_b32_e32 v53, 0xffff, v25
+; GFX9-DS128-NEXT:    v_and_b32_e32 v51, 0xffff, v24
+; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v0 offset:112
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
+; GFX9-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v11
+; GFX9-DS128-NEXT:    v_and_b32_e32 v28, 0xffff, v10
+; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v25
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
+; GFX9-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v25
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v24
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v24, s0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v8
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v42, 16, v39
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v40, 16, v38
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v46, 16, v37
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v44, 16, v36
+; GFX9-DS128-NEXT:    v_and_b32_e32 v41, 0xffff, v39
+; GFX9-DS128-NEXT:    v_and_b32_e32 v39, 0xffff, v38
+; GFX9-DS128-NEXT:    v_and_b32_e32 v45, 0xffff, v37
+; GFX9-DS128-NEXT:    v_and_b32_e32 v43, 0xffff, v36
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v61, 16, v58
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v59, 16, v57
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v56
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX9-DS128-NEXT:    v_and_b32_e32 v60, 0xffff, v58
+; GFX9-DS128-NEXT:    v_and_b32_e32 v58, 0xffff, v57
+; GFX9-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v56
+; GFX9-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v55
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
+; GFX9-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v27
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v26
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:224
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[4:7] offset:240
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[8:11] offset:192
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[58:61] offset:208
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[51:54] offset:160
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[47:50] offset:176
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[43:46] offset:128
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[39:42] offset:144
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[32:35] offset:96
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[20:23] offset:112
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[16:19] offset:64
+; GFX9-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:80
+; GFX9-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:32
+; GFX9-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[12:15]
+; GFX9-DS128-NEXT:    ds_write_b128 v24, v[28:31] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <64 x i16>, ptr addrspace(3) %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v64i16_to_v64i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_mov_b32 s15, 0xe8f000
+; SI-NEXT:    s_add_u32 s12, s12, s11
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v20, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[4:7], v20 offset0:8 offset1:9
+; SI-NEXT:    ds_read2_b64 v[0:3], v20 offset0:10 offset1:11
+; SI-NEXT:    ds_read2_b64 v[8:11], v20 offset0:12 offset1:13
+; SI-NEXT:    ds_read2_b64 v[12:15], v20 offset0:14 offset1:15
+; SI-NEXT:    ds_read2_b64 v[16:19], v20 offset1:1
+; SI-NEXT:    ds_read2_b64 v[30:33], v20 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[34:37], v20 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[38:41], v20 offset0:6 offset1:7
+; SI-NEXT:    s_waitcnt lgkmcnt(7)
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v23, 16, v4
+; SI-NEXT:    v_ashrrev_i32_e32 v25, 16, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v27, 16, v6
+; SI-NEXT:    s_waitcnt lgkmcnt(6)
+; SI-NEXT:    v_ashrrev_i32_e32 v29, 16, v1
+; SI-NEXT:    v_bfe_i32 v20, v5, 0, 16
+; SI-NEXT:    buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    v_bfe_i32 v22, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v24, v7, 0, 16
+; SI-NEXT:    v_bfe_i32 v26, v6, 0, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v21, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 16, v3
+; SI-NEXT:    v_bfe_i32 v28, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v20, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v6, v3, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 16, v2
+; SI-NEXT:    v_bfe_i32 v4, v2, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(5)
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v9
+; SI-NEXT:    v_bfe_i32 v2, v9, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 16, v8
+; SI-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v43, 16, v11
+; SI-NEXT:    v_bfe_i32 v42, v11, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 16, v10
+; SI-NEXT:    v_bfe_i32 v10, v10, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_ashrrev_i32_e32 v45, 16, v13
+; SI-NEXT:    v_bfe_i32 v44, v13, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 16, v12
+; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v47, 16, v15
+; SI-NEXT:    v_bfe_i32 v46, v15, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v14
+; SI-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    v_ashrrev_i32_e32 v49, 16, v17
+; SI-NEXT:    v_bfe_i32 v48, v17, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 16, v16
+; SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v51, 16, v19
+; SI-NEXT:    v_bfe_i32 v50, v19, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 16, v18
+; SI-NEXT:    v_bfe_i32 v18, v18, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(2)
+; SI-NEXT:    v_ashrrev_i32_e32 v53, 16, v31
+; SI-NEXT:    v_bfe_i32 v52, v31, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v31, 16, v30
+; SI-NEXT:    v_bfe_i32 v30, v30, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v55, 16, v33
+; SI-NEXT:    v_bfe_i32 v54, v33, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v33, 16, v32
+; SI-NEXT:    v_bfe_i32 v32, v32, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_ashrrev_i32_e32 v57, 16, v35
+; SI-NEXT:    v_bfe_i32 v56, v35, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v35, 16, v34
+; SI-NEXT:    v_bfe_i32 v34, v34, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v59, 16, v37
+; SI-NEXT:    v_bfe_i32 v58, v37, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v37, 16, v36
+; SI-NEXT:    v_bfe_i32 v36, v36, 0, 16
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v61, 16, v39
+; SI-NEXT:    v_bfe_i32 v60, v39, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v39, 16, v38
+; SI-NEXT:    v_bfe_i32 v38, v38, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v63, 16, v41
+; SI-NEXT:    v_bfe_i32 v62, v41, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v41, 16, v40
+; SI-NEXT:    v_bfe_i32 v40, v40, 0, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
+; SI-NEXT:    ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
+; SI-NEXT:    ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v0, v[16:17], v[48:49] offset1:1
+; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31
+; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27
+; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23
+; SI-NEXT:    ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19
+; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NO-DS128-NEXT:    s_mov_b32 s90, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v28, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v28 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-NO-DS128-NEXT:    s_add_u32 s88, s88, s11
+; VI-NO-DS128-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v11
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v11, 0, 16
+; VI-NO-DS128-NEXT:    buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v10
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v13
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v12
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
+; VI-NO-DS128-NEXT:    v_bfe_i32 v24, v30, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v29
+; VI-NO-DS128-NEXT:    v_bfe_i32 v26, v29, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v38, 16, v32
+; VI-NO-DS128-NEXT:    v_bfe_i32 v37, v32, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v40, 16, v31
+; VI-NO-DS128-NEXT:    v_bfe_i32 v39, v31, 0, 16
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v42, 16, v34
+; VI-NO-DS128-NEXT:    v_bfe_i32 v41, v34, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v44, 16, v33
+; VI-NO-DS128-NEXT:    v_bfe_i32 v43, v33, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v46, 16, v36
+; VI-NO-DS128-NEXT:    v_bfe_i32 v45, v36, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v48, 16, v35
+; VI-NO-DS128-NEXT:    v_bfe_i32 v47, v35, 0, 16
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v50, 16, v30
+; VI-NO-DS128-NEXT:    v_bfe_i32 v49, v30, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v52, 16, v29
+; VI-NO-DS128-NEXT:    v_bfe_i32 v51, v29, 0, 16
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v56, 16, v31
+; VI-NO-DS128-NEXT:    v_bfe_i32 v55, v31, 0, 16
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v54, 16, v32
+; VI-NO-DS128-NEXT:    v_bfe_i32 v53, v32, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v15
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v10, 0, 16
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v32, 16, v31
+; VI-NO-DS128-NEXT:    v_bfe_i32 v31, v31, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v30
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v30, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v30, s0
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v13, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v12, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v17
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v15, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v14, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v12, v17, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v16, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v21
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v21, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v20
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v20, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v23
+; VI-NO-DS128-NEXT:    v_bfe_i32 v20, v23, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v22
+; VI-NO-DS128-NEXT:    v_bfe_i32 v22, v22, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v58, 16, v34
+; VI-NO-DS128-NEXT:    v_bfe_i32 v57, v34, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v33
+; VI-NO-DS128-NEXT:    v_bfe_i32 v33, v33, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v60, 16, v36
+; VI-NO-DS128-NEXT:    v_bfe_i32 v59, v36, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v35
+; VI-NO-DS128-NEXT:    v_bfe_i32 v35, v35, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v62, 16, v29
+; VI-NO-DS128-NEXT:    v_bfe_i32 v61, v29, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v28
+; VI-NO-DS128-NEXT:    v_bfe_i32 v28, v28, 0, 16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; VI-NO-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s14, -1
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s15, 0xe00000
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v28, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v28 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_add_u32 s12, s12, s11
+; GFX9-NO-DS128-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v11, 0, 16
+; GFX9-NO-DS128-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT:    s_nop 0
+; GFX9-NO-DS128-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v13
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v12
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v30
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v24, v30, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v29
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v26, v29, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v38, 16, v32
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v37, v32, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v40, 16, v31
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v39, v31, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v42, 16, v34
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v41, v34, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v44, 16, v33
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v43, v33, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v46, 16, v36
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v45, v36, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v48, 16, v35
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v47, v35, 0, 16
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v50, 16, v30
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v49, v30, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v52, 16, v29
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v51, v29, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v56, 16, v31
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v55, v31, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v54, 16, v32
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v53, v32, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v15
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v10, 0, 16
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v32, 16, v31
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v31, v31, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v30
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v30, 0, 16
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v30, s0
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v13, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v12, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v17
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v15, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v14, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v17, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v16, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v21
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v21, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v20
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v20, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v23
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v20, v23, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v23, 16, v22
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v22, v22, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v58, 16, v34
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v57, v34, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v33
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v33, v33, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v60, 16, v36
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v59, v36, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v35
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v35, v35, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v62, 16, v29
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v61, v29, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v28
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v28, v28, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-NO-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v64i16_to_v64i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 116, @30, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    72(1.008935e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Z, OQAP,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.x,
+; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.W, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.Y, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.Z, OQAP,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Z, literal.x,
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T6.W
+; EG-NEXT:     MOV T6.W, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.Y, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.Z, OQAP,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Z, literal.x,
+; EG-NEXT:    104(1.457350e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T7.W
+; EG-NEXT:     MOV T7.W, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.Y, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.Z, OQAP,
+; EG-NEXT:     ADD_INT * T8.W, KC0[2].Z, literal.x,
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T8.W
+; EG-NEXT:     MOV T8.W, OQAP,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT:    120(1.681558e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.Y, OQAP,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.Z, OQAP,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 85, @31, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_READ_RET * OQAP, T9.W
+; EG-NEXT:     MOV T9.W, OQAP,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T10.W
+; EG-NEXT:     MOV T10.Y, OQAP,
+; EG-NEXT:     LSHR T10.W, T9.W, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT:     LDS_READ_RET * OQAP, T11.W
+; EG-NEXT:     MOV T10.Z, OQAP,
+; EG-NEXT:     LSHR * T11.Z, T10.Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T10.W, T10.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T0.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T0.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T0.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T1.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T1.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T1.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T2.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 36(5.044674e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T2.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T2.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 84(1.177091e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T3.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T3.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 68(9.528830e-44)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T3.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T4.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 116(1.625506e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T4.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 108(1.513402e-43)
+; EG-NEXT:    ALU 83, @32, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T4.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 100(1.401298e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T5.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 156(2.186026e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T5.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 148(2.073922e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T5.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 140(1.961818e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T6.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 132(1.849714e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T6.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 188(2.634441e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T6.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 180(2.522337e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T7.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 172(2.410233e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T7.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 164(2.298129e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T7.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 220(3.082857e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T8.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 212(2.970753e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T8.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 204(2.858649e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T8.W, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 196(2.746545e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T9.Y, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 252(3.531272e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T12.Z, T9.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 244(3.419168e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     LSHR T11.Z, T10.Z, literal.x,
+; EG-NEXT:     BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 236(3.307064e-43)
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     BFE_INT T10.W, T11.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T11.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 228(3.194960e-43)
+; EG-NEXT:    ALU 94, @33, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T11.W, T10.W,
+; EG-NEXT:     BFE_INT T9.W, T9.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     BFE_INT T9.W, T10.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     BFE_INT T9.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T10.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     BFE_INT T9.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     MOV * T10.W, KC0[2].Y,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T10.W, T9.W,
+; EG-NEXT:     BFE_INT T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T9.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T9.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T2.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 64(8.968310e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T3.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T4.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T4.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T4.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 144(2.017870e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T5.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 128(1.793662e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T6.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T6.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 176(2.466285e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T6.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT * T0.W, T7.Y, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 34, @34, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T7.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T7.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 208(2.914701e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T8.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 200(2.802597e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T8.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 192(2.690493e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T8.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T9.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 240(3.363116e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T9.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     BFE_INT T0.W, T10.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 224(3.138909e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-DS128-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-DS128-NEXT:    s_mov_b32 s90, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v32, s1
+; VI-DS128-NEXT:    ds_read_b128 v[8:11], v32
+; VI-DS128-NEXT:    ds_read_b128 v[16:19], v32 offset:16
+; VI-DS128-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-DS128-NEXT:    s_add_u32 s88, s88, s11
+; VI-DS128-NEXT:    s_addc_u32 s89, s89, 0
+; VI-DS128-NEXT:    ds_read_b128 v[24:27], v32 offset:32
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
+; VI-DS128-NEXT:    v_bfe_i32 v2, v19, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v18, 0, 16
+; VI-DS128-NEXT:    buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v6, 16, v17
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v4, 16, v16
+; VI-DS128-NEXT:    v_bfe_i32 v5, v17, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v3, v16, 0, 16
+; VI-DS128-NEXT:    buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
+; VI-DS128-NEXT:    ds_read_b128 v[33:36], v32 offset:48
+; VI-DS128-NEXT:    ds_read_b128 v[40:43], v32 offset:80
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v27
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v26
+; VI-DS128-NEXT:    v_bfe_i32 v18, v27, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v16, v26, 0, 16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v36
+; VI-DS128-NEXT:    v_bfe_i32 v26, v36, 0, 16
+; VI-DS128-NEXT:    ds_read_b128 v[36:39], v32 offset:64
+; VI-DS128-NEXT:    ds_read_b128 v[56:59], v32 offset:96
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v53, 16, v40
+; VI-DS128-NEXT:    v_bfe_i32 v52, v40, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v11
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v47, 16, v39
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v45, 16, v38
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v51, 16, v37
+; VI-DS128-NEXT:    v_bfe_i32 v46, v39, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v44, v38, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v50, v37, 0, 16
+; VI-DS128-NEXT:    ds_read_b128 v[37:40], v32 offset:112
+; VI-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v10
+; VI-DS128-NEXT:    v_mov_b32_e32 v23, v15
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v38
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v37
+; VI-DS128-NEXT:    v_bfe_i32 v2, v38, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v37, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
+; VI-DS128-NEXT:    v_bfe_i32 v22, v11, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v20, v10, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v25
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v24
+; VI-DS128-NEXT:    v_bfe_i32 v10, v25, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v8, v24, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v35
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v31, 16, v34
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v33
+; VI-DS128-NEXT:    v_bfe_i32 v24, v35, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v30, v34, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v28, v33, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v49, 16, v36
+; VI-DS128-NEXT:    v_bfe_i32 v48, v36, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v43
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v42
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v55, 16, v41
+; VI-DS128-NEXT:    v_bfe_i32 v35, v43, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v33, v42, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v54, v41, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v62, 16, v59
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v60, 16, v58
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v57
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v56
+; VI-DS128-NEXT:    v_bfe_i32 v61, v59, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v59, v58, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v6, v57, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v4, v56, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v43, 16, v40
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v41, 16, v39
+; VI-DS128-NEXT:    v_bfe_i32 v42, v40, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v40, v39, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:224
+; VI-DS128-NEXT:    ds_write_b128 v32, v[40:43] offset:240
+; VI-DS128-NEXT:    ds_write_b128 v32, v[4:7] offset:192
+; VI-DS128-NEXT:    ds_write_b128 v32, v[59:62] offset:208
+; VI-DS128-NEXT:    ds_write_b128 v32, v[52:55] offset:160
+; VI-DS128-NEXT:    ds_write_b128 v32, v[33:36] offset:176
+; VI-DS128-NEXT:    ds_write_b128 v32, v[48:51] offset:128
+; VI-DS128-NEXT:    ds_write_b128 v32, v[44:47] offset:144
+; VI-DS128-NEXT:    ds_write_b128 v32, v[28:31] offset:96
+; VI-DS128-NEXT:    ds_write_b128 v32, v[24:27] offset:112
+; VI-DS128-NEXT:    ds_write_b128 v32, v[8:11] offset:64
+; VI-DS128-NEXT:    ds_write_b128 v32, v[16:19] offset:80
+; VI-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:32
+; VI-DS128-NEXT:    buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
+; VI-DS128-NEXT:    s_waitcnt vmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v32, v[12:15]
+; VI-DS128-NEXT:    ds_write_b128 v32, v[20:23] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX9-DS128-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX9-DS128-NEXT:    s_mov_b32 s14, -1
+; GFX9-DS128-NEXT:    s_mov_b32 s15, 0xe00000
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v32, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[8:11], v32
+; GFX9-DS128-NEXT:    ds_read_b128 v[16:19], v32 offset:16
+; GFX9-DS128-NEXT:    s_add_u32 s12, s12, s11
+; GFX9-DS128-NEXT:    s_addc_u32 s13, s13, 0
+; GFX9-DS128-NEXT:    ds_read_b128 v[24:27], v32 offset:32
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v11
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v19
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v18
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v19, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v18, 0, 16
+; GFX9-DS128-NEXT:    buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    s_nop 0
+; GFX9-DS128-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v6, 16, v17
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v4, 16, v16
+; GFX9-DS128-NEXT:    v_bfe_i32 v5, v17, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v3, v16, 0, 16
+; GFX9-DS128-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    s_nop 0
+; GFX9-DS128-NEXT:    buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GFX9-DS128-NEXT:    ds_read_b128 v[33:36], v32 offset:48
+; GFX9-DS128-NEXT:    ds_read_b128 v[40:43], v32 offset:80
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v19, 16, v27
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 16, v26
+; GFX9-DS128-NEXT:    v_bfe_i32 v18, v27, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v26, 0, 16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v27, 16, v36
+; GFX9-DS128-NEXT:    v_bfe_i32 v26, v36, 0, 16
+; GFX9-DS128-NEXT:    ds_read_b128 v[36:39], v32 offset:64
+; GFX9-DS128-NEXT:    ds_read_b128 v[56:59], v32 offset:96
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v53, 16, v40
+; GFX9-DS128-NEXT:    v_bfe_i32 v52, v40, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v21, 16, v10
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v47, 16, v39
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v45, 16, v38
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v51, 16, v37
+; GFX9-DS128-NEXT:    v_bfe_i32 v46, v39, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v44, v38, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v50, v37, 0, 16
+; GFX9-DS128-NEXT:    ds_read_b128 v[37:40], v32 offset:112
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v32, s0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v23, v15
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 16, v9
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 16, v8
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 16, v38
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 16, v37
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v38, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v37, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v22, v11, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v20, v10, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v12, v8, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 16, v25
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 16, v24
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v25, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v24, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v25, 16, v35
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v31, 16, v34
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v29, 16, v33
+; GFX9-DS128-NEXT:    v_bfe_i32 v24, v35, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v30, v34, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v28, v33, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v49, 16, v36
+; GFX9-DS128-NEXT:    v_bfe_i32 v48, v36, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v36, 16, v43
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v34, 16, v42
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v55, 16, v41
+; GFX9-DS128-NEXT:    v_bfe_i32 v35, v43, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v33, v42, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v54, v41, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v62, 16, v59
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v60, 16, v58
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 16, v57
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 16, v56
+; GFX9-DS128-NEXT:    v_bfe_i32 v61, v59, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v59, v58, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v57, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v56, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v43, 16, v40
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v41, 16, v39
+; GFX9-DS128-NEXT:    v_bfe_i32 v42, v40, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v40, v39, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:224
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[40:43] offset:240
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[4:7] offset:192
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[59:62] offset:208
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[52:55] offset:160
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[33:36] offset:176
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[48:51] offset:128
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[44:47] offset:144
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[28:31] offset:96
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[24:27] offset:112
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[8:11] offset:64
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[16:19] offset:80
+; GFX9-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:32
+; GFX9-DS128-NEXT:    buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GFX9-DS128-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[12:15]
+; GFX9-DS128-NEXT:    ds_write_b128 v32, v[20:23] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <64 x i16>, ptr addrspace(3) %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
-; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
-
-; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: LDS_WRITE
 define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_i16_to_i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_zextload_i16_to_i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_zextload_i16_to_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_i16_to_i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 8, @35, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     MOV T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    0(0.000000e+00), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %a = load i16, ptr addrspace(3) %in
   %ext = zext i16 %a to i64
   store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
 ; FIXME: Need to optimize this sequence to avoid an extra shift.
 ;  t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
 ;          t28: i64 = any_extend t25
 ;        t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
-; SI: ds_read_i16 v[[LO:[0-9]+]],
-; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
-; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
-; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
-
-; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]]
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG-DAG: LDS_WRITE
-; EG-DAG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_i16_to_i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_sextload_i16_to_i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_sextload_i16_to_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_i16_to_i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 10, @36, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.X, OQAP,
+; EG-NEXT:     BFE_INT * T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %a = load i16, ptr addrspace(3) %in
   %ext = sext i16 %a to i64
   store i64 %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: LDS_WRITE
 define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v1i16_to_v1i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_u16 v0, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_zextload_v1i16_to_v1i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_zextload_v1i16_to_v1i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v1i16_to_v1i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 8, @37, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     MOV T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    0(0.000000e+00), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
-; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
-; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
-; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
-; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
-; EG-DAG: LDS_WRITE
-; EG-DAG: 16
-; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
 define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v1i16_to_v1i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_i16 v0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s0
+; SI-NEXT:    ds_write_b64 v2, v[0:1]
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: local_sextload_v1i16_to_v1i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    ds_read_u16 v0, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT:    ds_write_b64 v2, v[0:1]
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: local_sextload_v1i16_to_v1i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    ds_read_u16 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v1i16_to_v1i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 10, @38, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_USHORT_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.X, OQAP,
+; EG-NEXT:     BFE_INT * T0.W, PV.X, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
   %load = load <1 x i16>, ptr addrspace(3) %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v2i16_to_v2i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b32 v0, v0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b32 v0, v0
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v2i16_to_v2i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 17, @39, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.Y, OQAP,
+; EG-NEXT:     AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.W, literal.y,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v2i16_to_v2i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b32 v0, v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v2i16_to_v2i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b32 v2, v0
+; GFX9-DS128-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
 define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v2i16_to_v2i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b32 v0, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b32 v0, v0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b32 v0, v0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v2i16_to_v2i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 18, @40, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.Y, OQAP,
+; EG-NEXT:     BFE_INT * T0.W, PV.Y, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     ASHR T1.W, PV.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 4(5.605194e-45)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v2i16_to_v2i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b32 v1, v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; VI-DS128-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v2i16_to_v2i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b32 v1, v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <2 x i16>, ptr addrspace(3) %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v4i16_to_v4i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v3, 0
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_mov_b32_e32 v7, v3
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_mov_b32_e32 v10, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; SI-NEXT:    ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v10, v[6:7], v[8:9] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, v2
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v6, v2
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v9, s0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v8, v2
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v9, v[7:8], v[5:6] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v9, s0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v9, v[1:2], v[7:8] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v4i16_to_v4i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 35, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.W, literal.y,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v4i16_to_v4i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, v1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b64 v[7:8], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v9, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; VI-DS128-NEXT:    v_mov_b32_e32 v7, v1
+; VI-DS128-NEXT:    ds_write_b128 v9, v[0:3] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v9, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v4i16_to_v4i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b64 v[6:7], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v6, s2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v7
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v2, s2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v4i16_to_v4i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read_b64 v[0:1], v0
+; SI-NEXT:    v_mov_b32_e32 v9, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
+; SI-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT:    v_bfe_i32 v5, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v7, v4, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; SI-NEXT:    ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v9, v[5:6], v[7:8] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v4i16_to_v4i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 39, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     BFE_INT * T0.W, T0.Y, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     BFE_INT T1.Z, PV.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T1.W, PV.W, literal.y,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 20(2.802597e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.Z,
+; EG-NEXT:     ASHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v4i16_to_v4i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-DS128-NEXT:    v_bfe_i32 v4, v3, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    ds_write_b128 v8, v[4:7] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v3, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[4:7] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <4 x i16>, ptr addrspace(3) %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v8i16_to_v8i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v5, 0
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mov_b32_e32 v9, v5
+; SI-NEXT:    v_mov_b32_e32 v11, v5
+; SI-NEXT:    v_mov_b32_e32 v13, v5
+; SI-NEXT:    v_mov_b32_e32 v15, v5
+; SI-NEXT:    v_mov_b32_e32 v17, v5
+; SI-NEXT:    v_mov_b32_e32 v19, v5
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v0, v[14:15], v[18:19] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v11, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v10, v3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[9:10], v[2:3] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v9, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v7, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[6:7], v[8:9] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v6, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[5:6], v[1:2] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v1, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v5, v3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[4:5], v[0:1] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, v12
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v10, v12
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v13, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v3, v12
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v7, v12
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v6, v12
+; GFX9-NO-DS128-NEXT:    v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, v12
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v5, v12
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v13, v[0:1], v[4:5] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v8i16_to_v8i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 71, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     AND_INT T1.W, T0.W, literal.x,
+; EG-NEXT:     MOV * T2.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.W, literal.y,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v8i16_to_v8i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v14, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, 0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; VI-DS128-NEXT:    v_mov_b32_e32 v11, v1
+; VI-DS128-NEXT:    v_mov_b32_e32 v13, v1
+; VI-DS128-NEXT:    ds_write_b128 v14, v[10:13] offset:32
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, v1
+; VI-DS128-NEXT:    v_mov_b32_e32 v10, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v1
+; VI-DS128-NEXT:    ds_write_b128 v14, v[7:10] offset:16
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, v1
+; VI-DS128-NEXT:    v_mov_b32_e32 v7, v1
+; VI-DS128-NEXT:    ds_write_b128 v14, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v14, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v13, v11
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, v11
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v5, v11
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    s_mov_b32 s1, 0xffff
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v14, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    ds_write_b128 v14, v[10:13] offset:48
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v10, v11
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v2
+; GFX9-DS128-NEXT:    v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, v11
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v11
+; GFX9-DS128-NEXT:    ds_write_b128 v14, v[7:10] offset:16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v7, v11
+; GFX9-DS128-NEXT:    ds_write_b128 v14, v[0:3] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v14, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v8i16_to_v8i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v16, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v9, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 16, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 16, v3
+; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; SI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT:    v_bfe_i32 v14, v11, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT:    ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3
+; SI-NEXT:    ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v16, v[0:1], v[12:13] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v10, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v5, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v7, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; VI-NO-DS128-NEXT:    v_bfe_i32 v12, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v16, v[12:13], v[4:5] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v16, v[12:13], v[4:5] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v8i16_to_v8i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 80, @44, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV * T0.W, OQAP,
+; EG-NEXT:     BFE_INT T1.W, T0.Z, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     BFE_INT T1.Z, T0.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     BFE_INT T2.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T2.W, T1.Z, literal.y,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     BFE_INT T3.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T2.W, T2.Z, literal.y,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     ASHR T2.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 52(7.286752e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     MOV * T2.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T1.Z,
+; EG-NEXT:     ASHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T2.Z,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T3.Z,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v8i16_to_v8i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-DS128-NEXT:    v_bfe_i32 v6, v0, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; VI-DS128-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v14, v0, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT:    ds_write_b128 v16, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v16, v[12:15] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v16, v[8:11] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v16, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v16, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX9-DS128-NEXT:    v_bfe_i32 v12, v2, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v0, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v1, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v1, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-DS128-NEXT:    ds_write_b128 v16, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v16, v[12:15] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v16, v[8:11] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v16, v[4:7]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <8 x i16>, ptr addrspace(3) %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v16i16_to_v16i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v9, 0
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v11, v9
+; SI-NEXT:    v_mov_b32_e32 v13, v9
+; SI-NEXT:    v_mov_b32_e32 v15, v9
+; SI-NEXT:    v_mov_b32_e32 v17, v9
+; SI-NEXT:    v_mov_b32_e32 v20, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
+; SI-NEXT:    ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11
+; SI-NEXT:    v_mov_b32_e32 v16, v9
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v3
+; SI-NEXT:    ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
+; SI-NEXT:    ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v0
+; SI-NEXT:    v_mov_b32_e32 v5, v9
+; SI-NEXT:    ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v19, v9
+; SI-NEXT:    v_mov_b32_e32 v8, v9
+; SI-NEXT:    v_mov_b32_e32 v15, v9
+; SI-NEXT:    v_mov_b32_e32 v2, v9
+; SI-NEXT:    v_mov_b32_e32 v4, v9
+; SI-NEXT:    ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v20, v[7:8], v[3:4] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v8, 0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v10, v8
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v13, v8
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v14, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v5, 0xffff, v6
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v6, v8
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v5, 0xffff, v7
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v4
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, v8
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[12:13], v[7:8] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v3, v8
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v7, v8
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, v8
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v6, v8
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v1, v8
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v12, v8
+; VI-NO-DS128-NEXT:    ds_write2_b64 v14, v[0:1], v[11:12] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v12, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v14, v8
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v15, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v11, 0xffff, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v5, 0xffff, v6
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v6, v8
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v5, 0xffff, v7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v4
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v11, v8
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v10, v8
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v3, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v7, v8
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v6, v8
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, v8
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[0:1], v[13:14] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v16i16_to_v16i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 100, @45, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     MOV * T2.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT:     MOV * T3.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.W, literal.y,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:    ALU 42, @46, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v16i16_to_v16i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    v_mov_b32_e32 v26, 0
+; VI-DS128-NEXT:    v_mov_b32_e32 v22, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v24, v26
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v5
+; VI-DS128-NEXT:    ds_read_b128 v[13:16], v5 offset:16
+; VI-DS128-NEXT:    v_mov_b32_e32 v11, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v19, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, v26
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; VI-DS128-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v23, 16, v13
+; VI-DS128-NEXT:    v_and_b32_e32 v21, 0xffff, v13
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v27, 16, v14
+; VI-DS128-NEXT:    v_and_b32_e32 v25, 0xffff, v14
+; VI-DS128-NEXT:    v_mov_b32_e32 v14, s0
+; VI-DS128-NEXT:    v_mov_b32_e32 v13, v26
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; VI-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v16
+; VI-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v16
+; VI-DS128-NEXT:    ds_write_b128 v14, v[21:24] offset:64
+; VI-DS128-NEXT:    v_mov_b32_e32 v21, v26
+; VI-DS128-NEXT:    ds_write_b128 v14, v[10:13] offset:32
+; VI-DS128-NEXT:    v_mov_b32_e32 v10, v26
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; VI-DS128-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; VI-DS128-NEXT:    ds_write_b128 v14, v[18:21] offset:112
+; VI-DS128-NEXT:    v_mov_b32_e32 v16, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v18, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v28, v26
+; VI-DS128-NEXT:    ds_write_b128 v14, v[7:10] offset:16
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, v26
+; VI-DS128-NEXT:    v_mov_b32_e32 v7, v26
+; VI-DS128-NEXT:    ds_write_b128 v14, v[15:18] offset:96
+; VI-DS128-NEXT:    ds_write_b128 v14, v[0:3] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v14, v[25:28] offset:80
+; VI-DS128-NEXT:    ds_write_b128 v14, v[4:7]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v25, 0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v21, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v23, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v18, v25
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v4
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v4 offset:16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v28, s0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v15, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v12, v25
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v22, 16, v7
+; GFX9-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-DS128-NEXT:    v_and_b32_e32 v17, 0xffff, v6
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[20:23] offset:112
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v20, v25
+; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[17:20] offset:96
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v17, v25
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GFX9-DS128-NEXT:    v_and_b32_e32 v11, 0xffff, v1
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[14:17] offset:32
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v14, v25
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; GFX9-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v0
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
+; GFX9-DS128-NEXT:    v_and_b32_e32 v24, 0xffff, v5
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v5, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v7, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v27, v25
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[11:14] offset:16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v9, v25
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v11, v25
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[4:7] offset:64
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[0:3] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[24:27] offset:80
+; GFX9-DS128-NEXT:    ds_write_b128 v28, v[8:11]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v16i16_to_v16i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v4, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[4:7], v4 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v18, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v12, v3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v14, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v8, 16, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v3
+; SI-NEXT:    v_bfe_i32 v12, v12, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v1
+; SI-NEXT:    v_bfe_i32 v12, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v10, 16, v7
+; SI-NEXT:    v_bfe_i32 v12, v14, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7
+; SI-NEXT:    v_bfe_i32 v1, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v3, v5, 0, 16
+; SI-NEXT:    v_bfe_i32 v5, v6, 0, 16
+; SI-NEXT:    v_bfe_i32 v7, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v19, 0, 16
+; SI-NEXT:    v_bfe_i32 v14, v17, 0, 16
+; SI-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3
+; SI-NEXT:    v_bfe_i32 v3, v15, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13
+; SI-NEXT:    ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9
+; SI-NEXT:    ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5
+; SI-NEXT:    ds_write2_b64 v18, v[1:2], v[12:13] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v19, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v6, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v16, v7
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v18, 0, 16
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v18, v3
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v18, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v12, v11, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v19, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v16, v7
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v4, v18, 0, 16
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v18, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v14, v18, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v9, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v11, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v6, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v16i16_to_v16i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 101, @47, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     MOV * T1.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     BFE_INT T2.W, T1.W, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV * T2.Z, OQAP,
+; EG-NEXT:     BFE_INT T3.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T2.W, literal.y,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T4.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T3.Z, literal.y,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T5.Z, T0.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T6.Z, T0.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T5.Z, literal.y,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T7.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T6.Z, literal.y,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T8.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     BFE_INT T9.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     ASHR T3.W, T9.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 116(1.625506e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     ASHR T3.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     ASHR T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T3.W, T1.W,
+; EG-NEXT:     MOV * T1.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T1.W, T2.W,
+; EG-NEXT:     ASHR T1.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T3.Z,
+; EG-NEXT:     ASHR T1.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT:    ALU 62, @48, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T4.Z,
+; EG-NEXT:     ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T1.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T5.Z,
+; EG-NEXT:     ASHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T6.Z,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T7.Z,
+; EG-NEXT:     ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 108(1.513402e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T8.Z,
+; EG-NEXT:     ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T9.Z,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v16i16_to_v16i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[3:6], v0
+; VI-DS128-NEXT:    ds_read_b128 v[7:10], v0 offset:16
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_mov_b32_e32 v18, v6
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_bfe_i32 v11, v8, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; VI-DS128-NEXT:    v_bfe_i32 v13, v8, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:80
+; VI-DS128-NEXT:    v_bfe_i32 v11, v7, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-DS128-NEXT:    v_bfe_i32 v13, v7, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v15, v10
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; VI-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:64
+; VI-DS128-NEXT:    v_bfe_i32 v11, v15, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v13, v7, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v9
+; VI-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:112
+; VI-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; VI-DS128-NEXT:    v_bfe_i32 v10, v4, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; VI-DS128-NEXT:    ds_write_b128 v8, v[14:17] offset:96
+; VI-DS128-NEXT:    v_bfe_i32 v14, v18, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v16, v19, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v12, v4, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v6, v7, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT:    ds_write_b128 v8, v[14:17] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v8, v[4:7] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v8, v[10:13] offset:16
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[3:6], v0
+; GFX9-DS128-NEXT:    ds_read_b128 v[7:10], v0 offset:16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_bfe_i32 v0, v3, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX9-DS128-NEXT:    v_bfe_i32 v11, v8, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v3, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:80
+; GFX9-DS128-NEXT:    v_bfe_i32 v11, v7, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v7, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v15, v10
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:64
+; GFX9-DS128-NEXT:    v_bfe_i32 v11, v15, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v7, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v9
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[11:14] offset:112
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v9, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v18, v6
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-DS128-NEXT:    v_bfe_i32 v10, v4, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[14:17] offset:96
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v18, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v19, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v12, v4, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v7, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[14:17] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[4:7] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[10:13] offset:16
+; GFX9-DS128-NEXT:    ds_write_b128 v8, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(3) %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_zextload_v32i16_to_v32i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[2:5], v0 offset0:2 offset1:3
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    ds_read2_b64 v[6:9], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v19, v1
+; SI-NEXT:    v_mov_b32_e32 v21, v1
+; SI-NEXT:    v_mov_b32_e32 v22, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; SI-NEXT:    ds_read2_b64 v[10:13], v0 offset0:4 offset1:5
+; SI-NEXT:    ds_read2_b64 v[14:17], v0 offset0:6 offset1:7
+; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v3
+; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v9
+; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v7
+; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v17
+; SI-NEXT:    ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31
+; SI-NEXT:    v_mov_b32_e32 v18, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_mov_b32_e32 v20, v1
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v15
+; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v2
+; SI-NEXT:    v_mov_b32_e32 v4, v1
+; SI-NEXT:    ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_mov_b32_e32 v9, v1
+; SI-NEXT:    v_mov_b32_e32 v7, v1
+; SI-NEXT:    v_mov_b32_e32 v3, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; SI-NEXT:    ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
+; SI-NEXT:    ds_write2_b64 v22, v[6:7], v[4:5] offset1:1
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v11
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v16
+; SI-NEXT:    v_mov_b32_e32 v6, v1
+; SI-NEXT:    ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19
+; SI-NEXT:    v_mov_b32_e32 v11, v1
+; SI-NEXT:    v_mov_b32_e32 v5, v1
+; SI-NEXT:    v_mov_b32_e32 v13, v1
+; SI-NEXT:    v_mov_b32_e32 v16, v1
+; SI-NEXT:    ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v19, v5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v21, v5
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v22, s0
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v4 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v4 offset1:1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(3)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v2
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v1
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(5)
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v9
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v1, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v9, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v7
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v8, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v7, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(8)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v13
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v12
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v11
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(10)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v14
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v14
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v3
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v14, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v10, 16, v17
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v17
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v13, v5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v11, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v9, 0xffff, v16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v10, v5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v3, v5
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
+; VI-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v2, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v9, v5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v1, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v22, v[8:9], v[0:1] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v19, v5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v21, v5
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v22, s0
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[10:13], v4 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[14:17], v4 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(2)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v2
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v2
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v18, 0xffff, v1
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v9
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v7
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(7)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v17
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v17
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v15
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v14
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v8, 0xffff, v10
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v9, 0xffff, v12
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v12, 0xffff, v13
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v3
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v14, v5
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; GFX9-NO-DS128-NEXT:    v_and_b32_e32 v6, 0xffff, v11
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v13, v5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v11, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v10, v5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v2, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v9, v5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v22, v[8:9], v[0:1] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_zextload_v32i16_to_v32i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 105, @49, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    56(7.847271e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.W, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T1.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T2.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T3.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T4.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     MOV * T5.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     LSHR T5.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T5.W,
+; EG-NEXT:     AND_INT T4.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T5.Y, literal.x,
+; EG-NEXT:     MOV * T5.W, KC0[2].Y,
+; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 48(6.726233e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T4.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 32(4.484155e-44)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     LSHR T4.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 88(1.233143e-43)
+; EG-NEXT:     LDS_WRITE * T5.W, T4.W,
+; EG-NEXT:     AND_INT T3.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 80(1.121039e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:    ALU 93, @50, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 64(8.968310e-44)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T3.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 112(1.569454e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     LSHR T3.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T4.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T4.W, T3.W,
+; EG-NEXT:     AND_INT T2.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 96(1.345247e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 144(2.017870e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T2.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 128(1.793662e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     LSHR T2.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT:     LDS_WRITE * T3.W, T2.W,
+; EG-NEXT:     AND_INT T1.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 176(2.466285e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 160(2.242078e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T1.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 208(2.914701e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     LSHR T1.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 200(2.802597e-43)
+; EG-NEXT:     LDS_WRITE * T2.W, T1.W,
+; EG-NEXT:     AND_INT T0.W, T0.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 192(2.690493e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 240(3.363116e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     LSHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    65535(9.183409e-41), 224(3.138909e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:     MOV * T1.W, literal.y,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 87, @51, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    92(1.289195e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    76(1.064987e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    124(1.737610e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    108(1.513402e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    100(1.401298e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    156(2.186026e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    148(2.073922e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    140(1.961818e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    132(1.849714e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    188(2.634441e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    180(2.522337e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    172(2.410233e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    164(2.298129e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    220(3.082857e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    212(2.970753e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    204(2.858649e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    196(2.746545e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    252(3.531272e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    244(3.419168e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    236(3.307064e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    228(3.194960e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T1.W,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_zextload_v32i16_to_v32i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DS128-NEXT:    ds_read_b128 v[3:6], v1
+; VI-DS128-NEXT:    ds_read_b128 v[7:10], v1 offset:16
+; VI-DS128-NEXT:    v_mov_b32_e32 v52, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
+; VI-DS128-NEXT:    v_and_b32_e32 v17, 0xffff, v8
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v22, 16, v7
+; VI-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; VI-DS128-NEXT:    v_and_b32_e32 v23, 0xffff, v10
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; VI-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v9
+; VI-DS128-NEXT:    ds_read_b128 v[7:10], v1 offset:32
+; VI-DS128-NEXT:    ds_read_b128 v[29:32], v1 offset:48
+; VI-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v6
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; VI-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v38, 16, v7
+; VI-DS128-NEXT:    v_and_b32_e32 v36, 0xffff, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
+; VI-DS128-NEXT:    v_and_b32_e32 v42, 0xffff, v9
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v30
+; VI-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v30
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v32
+; VI-DS128-NEXT:    v_and_b32_e32 v48, 0xffff, v32
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
+; VI-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v31, 0
+; VI-DS128-NEXT:    v_mov_b32_e32 v49, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v51, v31
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v47, 16, v29
+; VI-DS128-NEXT:    v_and_b32_e32 v45, 0xffff, v29
+; VI-DS128-NEXT:    ds_write_b128 v52, v[48:51] offset:240
+; VI-DS128-NEXT:    v_mov_b32_e32 v46, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v48, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v27, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v29, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[45:48] offset:192
+; VI-DS128-NEXT:    v_mov_b32_e32 v43, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v45, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[26:29] offset:96
+; VI-DS128-NEXT:    v_mov_b32_e32 v24, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v26, v31
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v41, 16, v10
+; VI-DS128-NEXT:    v_and_b32_e32 v39, 0xffff, v10
+; VI-DS128-NEXT:    ds_write_b128 v52, v[42:45] offset:160
+; VI-DS128-NEXT:    v_mov_b32_e32 v40, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v42, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[23:26] offset:112
+; VI-DS128-NEXT:    v_mov_b32_e32 v21, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v23, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[39:42] offset:176
+; VI-DS128-NEXT:    v_mov_b32_e32 v37, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v39, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[20:23] offset:64
+; VI-DS128-NEXT:    v_mov_b32_e32 v18, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v20, v31
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
+; VI-DS128-NEXT:    v_and_b32_e32 v33, 0xffff, v8
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v10, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[36:39] offset:128
+; VI-DS128-NEXT:    v_mov_b32_e32 v34, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v36, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[17:20] offset:80
+; VI-DS128-NEXT:    v_mov_b32_e32 v15, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v17, v31
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; VI-DS128-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; VI-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; VI-DS128-NEXT:    ds_write_b128 v52, v[7:10] offset:208
+; VI-DS128-NEXT:    ds_write_b128 v52, v[33:36] offset:144
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v7, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v33, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[14:17] offset:48
+; VI-DS128-NEXT:    v_mov_b32_e32 v12, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v14, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v1, v31
+; VI-DS128-NEXT:    v_mov_b32_e32 v3, v31
+; VI-DS128-NEXT:    ds_write_b128 v52, v[4:7] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v52, v[30:33] offset:224
+; VI-DS128-NEXT:    ds_write_b128 v52, v[11:14]
+; VI-DS128-NEXT:    ds_write_b128 v52, v[0:3] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[3:6], v1
+; GFX9-DS128-NEXT:    ds_read_b128 v[7:10], v1 offset:16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v52, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
+; GFX9-DS128-NEXT:    v_and_b32_e32 v17, 0xffff, v8
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v22, 16, v7
+; GFX9-DS128-NEXT:    v_and_b32_e32 v20, 0xffff, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; GFX9-DS128-NEXT:    v_and_b32_e32 v23, 0xffff, v10
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
+; GFX9-DS128-NEXT:    v_and_b32_e32 v26, 0xffff, v9
+; GFX9-DS128-NEXT:    ds_read_b128 v[7:10], v1 offset:32
+; GFX9-DS128-NEXT:    ds_read_b128 v[29:32], v1 offset:48
+; GFX9-DS128-NEXT:    v_and_b32_e32 v14, 0xffff, v6
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX9-DS128-NEXT:    v_and_b32_e32 v0, 0xffff, v4
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v38, 16, v7
+; GFX9-DS128-NEXT:    v_and_b32_e32 v36, 0xffff, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v44, 16, v9
+; GFX9-DS128-NEXT:    v_and_b32_e32 v42, 0xffff, v9
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v30
+; GFX9-DS128-NEXT:    v_and_b32_e32 v7, 0xffff, v30
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v50, 16, v32
+; GFX9-DS128-NEXT:    v_and_b32_e32 v48, 0xffff, v32
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v32, 16, v31
+; GFX9-DS128-NEXT:    v_and_b32_e32 v30, 0xffff, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v31, 0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v49, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v51, v31
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v47, 16, v29
+; GFX9-DS128-NEXT:    v_and_b32_e32 v45, 0xffff, v29
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[48:51] offset:240
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v46, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v48, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v27, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v29, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[45:48] offset:192
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v43, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v45, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[26:29] offset:96
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v24, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v26, v31
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v41, 16, v10
+; GFX9-DS128-NEXT:    v_and_b32_e32 v39, 0xffff, v10
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[42:45] offset:160
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v40, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v42, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[23:26] offset:112
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v21, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v23, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[39:42] offset:176
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v37, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v39, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[20:23] offset:64
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v18, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v20, v31
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
+; GFX9-DS128-NEXT:    v_and_b32_e32 v33, 0xffff, v8
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v8, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v10, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[36:39] offset:128
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v34, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v36, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[17:20] offset:80
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v15, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v17, v31
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX9-DS128-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX9-DS128-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[7:10] offset:208
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[33:36] offset:144
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v5, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v7, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v33, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[14:17] offset:48
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v12, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v14, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v1, v31
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v3, v31
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[4:7] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[30:33] offset:224
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[11:14]
+; GFX9-DS128-NEXT:    ds_write_b128 v52, v[0:3] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(3) %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
-; GFX9-NOT: m0
-; SICIVI: s_mov_b32 m0
-
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: BFE_INT
-; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
+; SI-LABEL: local_sextload_v32i16_to_v32i64:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v12, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
+; SI-NEXT:    ds_read2_b64 v[0:3], v12 offset1:1
+; SI-NEXT:    ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
+; SI-NEXT:    ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
+; SI-NEXT:    s_waitcnt lgkmcnt(3)
+; SI-NEXT:    v_mov_b32_e32 v18, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v7
+; SI-NEXT:    v_bfe_i32 v18, v18, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    v_mov_b32_e32 v7, s0
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v5
+; SI-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11
+; SI-NEXT:    s_waitcnt lgkmcnt(4)
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v3
+; SI-NEXT:    v_bfe_i32 v18, v5, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v1
+; SI-NEXT:    v_bfe_i32 v18, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3
+; SI-NEXT:    s_waitcnt lgkmcnt(5)
+; SI-NEXT:    v_mov_b32_e32 v1, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v11
+; SI-NEXT:    v_bfe_i32 v18, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v9
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 16, v9
+; SI-NEXT:    v_bfe_i32 v18, v9, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; SI-NEXT:    ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27
+; SI-NEXT:    s_waitcnt lgkmcnt(6)
+; SI-NEXT:    v_mov_b32_e32 v1, v15
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v15
+; SI-NEXT:    v_bfe_i32 v17, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; SI-NEXT:    ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v13
+; SI-NEXT:    v_ashrrev_i32_e32 v15, 16, v13
+; SI-NEXT:    v_bfe_i32 v17, v13, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; SI-NEXT:    ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_bfe_i32 v5, v6, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; SI-NEXT:    ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; SI-NEXT:    v_bfe_i32 v3, v4, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v8
+; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v12
+; SI-NEXT:    v_bfe_i32 v1, v12, 0, 16
+; SI-NEXT:    v_bfe_i32 v3, v14, 0, 16
+; SI-NEXT:    v_bfe_i32 v5, v8, 0, 16
+; SI-NEXT:    v_bfe_i32 v8, v10, 0, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; SI-NEXT:    v_bfe_i32 v9, v0, 0, 16
+; SI-NEXT:    v_bfe_i32 v10, v2, 0, 16
+; SI-NEXT:    v_bfe_i32 v12, v11, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; SI-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; SI-NEXT:    ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5
+; SI-NEXT:    v_bfe_i32 v11, v6, 0, 16
+; SI-NEXT:    v_bfe_i32 v13, v4, 0, 16
+; SI-NEXT:    v_bfe_i32 v15, v15, 0, 16
+; SI-NEXT:    v_bfe_i32 v16, v14, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; SI-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; SI-NEXT:    ds_write2_b64 v7, v[9:10], v[16:17] offset1:1
+; SI-NEXT:    v_bfe_i32 v17, v18, 0, 16
+; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; SI-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; SI-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; SI-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; SI-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; SI-NEXT:    ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29
+; SI-NEXT:    ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25
+; SI-NEXT:    ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21
+; SI-NEXT:    ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v7 offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[12:15], v7 offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v11, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[3:6], v7 offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[7:10], v7 offset1:1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v2, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v17, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(6)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v18, v15, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v15, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v17, v14, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21
+; VI-NO-DS128-NEXT:    v_bfe_i32 v14, v14, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v16, v13, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v12, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v17, v16, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(9)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v15, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v17, v6, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v6, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(9)
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13
+; VI-NO-DS128-NEXT:    v_bfe_i32 v5, v12, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v4, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v13, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11
+; VI-NO-DS128-NEXT:    v_bfe_i32 v15, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v3
+; VI-NO-DS128-NEXT:    v_bfe_i32 v17, v3, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v21, v0, 0, 16
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; VI-NO-DS128-NEXT:    v_bfe_i32 v19, v19, 0, 16
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9
+; VI-NO-DS128-NEXT:    v_bfe_i32 v17, v10, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v4, v7, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v6, v8, 0, 16
+; VI-NO-DS128-NEXT:    v_bfe_i32 v8, v9, 0, 16
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3
+; VI-NO-DS128-NEXT:    ds_write2_b64 v11, v[4:5], v[1:2] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v8, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[4:7], v8 offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v8 offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v15, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v9, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[11:14], v8 offset1:1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[7:10], v8 offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v16, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v6, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v17, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v1, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(8)
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v10, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v9, v9, 0, 16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v8, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v20, 16, v12
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v1, v20, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v7, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v20, v4, 0, 16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, v14
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v11
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v8, v12, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v12, v18, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v18, v19, 0, 16
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v16, v0, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v3, v11, 0, 16
+; GFX9-NO-DS128-NEXT:    v_bfe_i32 v10, v13, 0, 16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-NO-DS128-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v15, v[3:4], v[5:6] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_sextload_v32i16_to_v32i64:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 107, @52, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T1.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T1.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T1.W, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    28(3.923636e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T2.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T2.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T2.W, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T3.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    44(6.165713e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T3.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    40(5.605194e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T3.W, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T4.Y, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T4.Z, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    60(8.407791e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T4.W, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T5.Y, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T5.Z, OQAP,
+; EG-NEXT:     BFE_INT T0.W, T5.Y, 0.0, literal.x,
+; EG-NEXT:     ADD_INT * T5.W, KC0[2].Z, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_READ_RET * OQAP, T5.W
+; EG-NEXT:     MOV * T5.W, OQAP,
+; EG-NEXT:     BFE_INT T0.Z, T5.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T0.W, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    20(2.802597e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T6.Z, T0.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T0.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T7.Z, T1.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T6.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    52(7.286752e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T8.Z, T1.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    36(5.044674e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T9.Z, T1.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T8.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    84(1.177091e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T10.Z, T2.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T9.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    68(9.528830e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T11.Z, T2.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    116(1.625506e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT * T12.Z, T2.W, 0.0, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 98, @53, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ASHR T6.W, T11.Z, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 100(1.401298e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T13.Z, T3.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T12.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    148(2.073922e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T14.Z, T3.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    132(1.849714e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T15.Z, T3.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T14.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    180(2.522337e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T16.Z, T4.Y, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T15.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    164(2.298129e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T17.Z, T4.Z, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    212(2.970753e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T18.Z, T4.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T17.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    196(2.746545e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     BFE_INT T19.Z, T5.W, 0.0, literal.x,
+; EG-NEXT:     ASHR T6.W, T18.Z, literal.y,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.z,
+; EG-NEXT:    16(2.242078e-44), 31(4.344025e-44)
+; EG-NEXT:    244(3.419168e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     ASHR T6.W, T19.Z, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 228(3.194960e-43)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     ASHR T6.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 28(3.923636e-44)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     ASHR T6.W, T5.Y, literal.x,
+; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
+; EG-NEXT:     LDS_WRITE * T7.W, T6.W,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T5.Z, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 12(1.681558e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T5.Z, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 8(1.121039e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.Z,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 60(8.407791e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 56(7.847271e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T6.Z,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 44(6.165713e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T1.Y, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 40(5.605194e-44)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T7.Z,
+; EG-NEXT:     ASHR T0.W, T1.Z, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 92(1.289195e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR * T0.W, T1.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 99, @54, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.x,
+; EG-NEXT:    88(1.233143e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T8.Z,
+; EG-NEXT:     ASHR T0.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T6.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 76(1.064987e-43)
+; EG-NEXT:     LDS_WRITE * T6.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T1.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 72(1.008935e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    64(8.968310e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T9.Z,
+; EG-NEXT:     ASHR T0.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 124(1.737610e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T2.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 120(1.681558e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    112(1.569454e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T10.Z,
+; EG-NEXT:     ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 108(1.513402e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T2.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 104(1.457350e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    96(1.345247e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T11.Z,
+; EG-NEXT:     ASHR T0.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 156(2.186026e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T2.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 152(2.129974e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    144(2.017870e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T12.Z,
+; EG-NEXT:     ASHR T0.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 140(1.961818e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T3.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 136(1.905766e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    128(1.793662e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T13.Z,
+; EG-NEXT:     ASHR T0.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 188(2.634441e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T3.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 184(2.578389e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    176(2.466285e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T14.Z,
+; EG-NEXT:     ASHR T0.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 172(2.410233e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T3.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 168(2.354181e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    160(2.242078e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T15.Z,
+; EG-NEXT:     ASHR T0.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 220(3.082857e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T4.Y, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 216(3.026805e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    208(2.914701e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T16.Z,
+; EG-NEXT:     ASHR T0.W, T4.Z, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 204(2.858649e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR * T0.W, T4.Z, literal.x,
+; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT:    ALU 27, @55, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT:    200(2.802597e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    192(2.690493e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T17.Z,
+; EG-NEXT:     ASHR T0.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 252(3.531272e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T4.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 248(3.475220e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    240(3.363116e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T18.Z,
+; EG-NEXT:     ASHR T0.W, T5.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    31(4.344025e-44), 236(3.307064e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ASHR T0.W, T5.W, literal.x,
+; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT:    16(2.242078e-44), 232(3.251012e-43)
+; EG-NEXT:     LDS_WRITE * T1.W, T0.W,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    224(3.138909e-43), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T19.Z,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_sextload_v32i16_to_v32i64:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v4 offset:48
+; VI-DS128-NEXT:    ds_read_b128 v[9:12], v4 offset:32
+; VI-DS128-NEXT:    v_mov_b32_e32 v8, s0
+; VI-DS128-NEXT:    ds_read_b128 v[17:20], v4 offset:16
+; VI-DS128-NEXT:    ds_read_b128 v[4:7], v4
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(3)
+; VI-DS128-NEXT:    v_bfe_i32 v13, v2, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; VI-DS128-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT:    v_mov_b32_e32 v2, v3
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; VI-DS128-NEXT:    ds_write_b128 v8, v[13:16] offset:224
+; VI-DS128-NEXT:    v_bfe_i32 v13, v2, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v15, v3, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; VI-DS128-NEXT:    ds_write_b128 v8, v[13:16] offset:240
+; VI-DS128-NEXT:    v_bfe_i32 v15, v2, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; VI-DS128-NEXT:    v_bfe_i32 v13, v0, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v0, v1, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:208
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(5)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v11
+; VI-DS128-NEXT:    v_bfe_i32 v0, v11, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    ds_write_b128 v8, v[13:16] offset:192
+; VI-DS128-NEXT:    v_mov_b32_e32 v13, v12
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:160
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
+; VI-DS128-NEXT:    v_bfe_i32 v0, v13, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:176
+; VI-DS128-NEXT:    v_bfe_i32 v0, v9, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; VI-DS128-NEXT:    v_bfe_i32 v9, v10, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v11, v11, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v8, v[9:12] offset:144
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(8)
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_bfe_i32 v9, v19, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v11, v11, 0, 16
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:128
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(8)
+; VI-DS128-NEXT:    v_bfe_i32 v0, v5, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; VI-DS128-NEXT:    v_mov_b32_e32 v5, v20
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    ds_write_b128 v8, v[9:12] offset:96
+; VI-DS128-NEXT:    v_bfe_i32 v9, v5, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; VI-DS128-NEXT:    v_bfe_i32 v11, v5, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v17
+; VI-DS128-NEXT:    ds_write_b128 v8, v[9:12] offset:112
+; VI-DS128-NEXT:    v_bfe_i32 v9, v17, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v11, v5, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v5, 16, v18
+; VI-DS128-NEXT:    ds_write_b128 v8, v[9:12] offset:64
+; VI-DS128-NEXT:    v_bfe_i32 v9, v4, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; VI-DS128-NEXT:    v_bfe_i32 v13, v18, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v15, v5, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v11, v4, 0, 16
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, v7
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT:    ds_write_b128 v8, v[13:16] offset:80
+; VI-DS128-NEXT:    v_bfe_i32 v13, v4, 0, 16
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; VI-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; VI-DS128-NEXT:    v_bfe_i32 v15, v4, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v4, v6, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v6, v7, 0, 16
+; VI-DS128-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; VI-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; VI-DS128-NEXT:    ds_write_b128 v8, v[4:7] offset:32
+; VI-DS128-NEXT:    ds_write_b128 v8, v[13:16] offset:48
+; VI-DS128-NEXT:    ds_write_b128 v8, v[9:12]
+; VI-DS128-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v13, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[4:7], v13 offset:48
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v13 offset:32
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v12, s0
+; GFX9-DS128-NEXT:    ds_read_b128 v[8:11], v13
+; GFX9-DS128-NEXT:    ds_read_b128 v[18:21], v13 offset:16
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(3)
+; GFX9-DS128-NEXT:    v_bfe_i32 v14, v6, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-DS128-NEXT:    v_bfe_i32 v16, v6, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v6, v7
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[14:17] offset:224
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v6, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v7, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:240
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v6, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v4, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v5, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[4:7] offset:208
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(5)
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v2, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v2, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:192
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v13, v3
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[4:7] offset:160
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v13, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v2, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[4:7] offset:176
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v6, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(6)
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v9
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:144
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v20, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[2:5] offset:128
+; GFX9-DS128-NEXT:    v_bfe_i32 v4, v0, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, v21
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:96
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v21
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v0, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:112
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v18, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v0, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:64
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v19, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v1, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v6, v8, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v8, v0, 0, 16
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, v11
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:80
+; GFX9-DS128-NEXT:    v_bfe_i32 v13, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX9-DS128-NEXT:    v_bfe_i32 v15, v0, 0, 16
+; GFX9-DS128-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
+; GFX9-DS128-NEXT:    v_bfe_i32 v17, v10, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v19, v0, 0, 16
+; GFX9-DS128-NEXT:    v_bfe_i32 v2, v9, 0, 16
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX9-DS128-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[17:20] offset:32
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[13:16] offset:48
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[6:9]
+; GFX9-DS128-NEXT:    ds_write_b128 v12, v[2:5] offset:16
+; GFX9-DS128-NEXT:    s_endpgm
   %load = load <32 x i16>, ptr addrspace(3) %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, ptr addrspace(3) %out
@@ -948,19 +8949,95 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out
 ; }
 
 ; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
-; FUNC-LABEL: {{^}}local_v8i16_to_128:
-
-; SI-NOT: ds_read_b128
-; SI-NOT: ds_write_b128
-
-; CIVI: ds_read_b128
-; CIVI: ds_write_b128
-
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
-; EG: LDS_READ_RET
 define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
+; SI-LABEL: local_v8i16_to_128:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    s_mov_b32 m0, -1
+; SI-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; SI-NEXT:    v_mov_b32_e32 v4, s0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; SI-NEXT:    s_endpgm
+;
+; VI-NO-DS128-LABEL: local_v8i16_to_128:
+; VI-NO-DS128:       ; %bb.0:
+; VI-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-NO-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; VI-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; VI-NO-DS128-NEXT:    s_endpgm
+;
+; GFX9-NO-DS128-LABEL: local_v8i16_to_128:
+; GFX9-NO-DS128:       ; %bb.0:
+; GFX9-NO-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NO-DS128-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
+; GFX9-NO-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NO-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NO-DS128-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
+; GFX9-NO-DS128-NEXT:    s_endpgm
+;
+; EG-LABEL: local_v8i16_to_128:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 25, @56, KC0[CB0:0-32], KC1[]
+; EG-NEXT:     MOV * T0.W, KC0[2].Z,
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     MOV * T0.W, KC0[2].Y,
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_READ_RET * OQAP, T0.W
+; EG-NEXT:     MOV T0.X, OQAP,
+; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT:     LDS_WRITE * T0.W, T0.X,
+; EG-NEXT:    RETURN
+;
+; VI-DS128-LABEL: local_v8i16_to_128:
+; VI-DS128:       ; %bb.0:
+; VI-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VI-DS128-NEXT:    s_mov_b32 m0, -1
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; VI-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; VI-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; VI-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; VI-DS128-NEXT:    s_endpgm
+;
+; GFX9-DS128-LABEL: local_v8i16_to_128:
+; GFX9-DS128:       ; %bb.0:
+; GFX9-DS128-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-DS128-NEXT:    ds_read_b128 v[0:3], v0
+; GFX9-DS128-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-DS128-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DS128-NEXT:    ds_write_b128 v4, v[0:3]
+; GFX9-DS128-NEXT:    s_endpgm
   %ld = load <8 x i16>, ptr addrspace(3) %in, align 16
   store <8 x i16> %ld, ptr addrspace(3) %out, align 16
   ret void


        


More information about the llvm-commits mailing list