[llvm] [AMDGPU] Promote uniform ops to I32 in ISel (PR #106383)

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 05:55:06 PDT 2024


https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/106383

See #106382 for NFC test updates.

Promote uniform binops, selects and setcc in Global & DAGISel instead of CGP.

Solves #64591

From ddae283c0cac640c5d27972e8d62c57db7e722dc Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 28 Aug 2024 14:30:03 +0200
Subject: [PATCH 1/2] [NFC][AMDGPU] Autogenerate a few tests for uniform to i16
 promotion in ISel

---
 .../CodeGen/AMDGPU/extract_vector_dynelt.ll   | 1463 +++++++++++++----
 .../CodeGen/AMDGPU/extract_vector_elt-i8.ll   |  656 ++++++--
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     | 1104 ++++++++++++-
 3 files changed, 2712 insertions(+), 511 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index ee1df9aa0d6cea..0a2cac5a3e26ba 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1,415 +1,729 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-; GCN-LABEL: {{^}}float4_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
-; GCN:     store_dword v[{{[0-9:]+}}], [[V3]]
 define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: float4_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[2:3]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 2.0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}int4_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2
-; GCN-DAG: s_cmp_eq_u32 [[IDX]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4, [[V2]], vcc
-; GCN: store_dword v[{{[0-9:]+}}], [[V3]]
 define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: int4_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 2, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 4, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
   store i32 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double4_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
-; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}}
-; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 3
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}}
-; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double4_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GCN-NEXT:    s_mov_b32 s3, 0xc28f5c29
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0x3f847ae1
+; GCN-NEXT:    s_cselect_b32 s3, s3, 0x47ae147b
+; GCN-NEXT:    s_cmp_eq_u32 s4, 2
+; GCN-NEXT:    s_cselect_b32 s3, 0xe147ae14, s3
+; GCN-NEXT:    s_cselect_b32 s2, 0x4000147a, s2
+; GCN-NEXT:    s_cmp_eq_u32 s4, 3
+; GCN-NEXT:    s_cselect_b32 s2, 0x40100a3d, s2
+; GCN-NEXT:    s_cselect_b32 s3, 0x70a3d70a, s3
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double5_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
-; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 2
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0xe147ae14, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x4000147a, s{{[0-9]+}}
-; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 3
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40100a3d, s{{[0-9]+}}
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x70a3d70a, s{{[0-9]+}}
-; GCN-DAG: s_cmp_eq_u32 s{{[[0-9]+}}, 4
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, 0x40140a3d, s{{[0-9]+}}
-; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double5_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GCN-NEXT:    s_mov_b32 s3, 0xc28f5c29
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s6, 1
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0x3f847ae1
+; GCN-NEXT:    s_cselect_b32 s3, s3, 0x47ae147b
+; GCN-NEXT:    s_cmp_eq_u32 s6, 2
+; GCN-NEXT:    s_cselect_b32 s8, 0xe147ae14, s3
+; GCN-NEXT:    s_cselect_b32 s7, 0x4000147a, s2
+; GCN-NEXT:    s_cmp_eq_u32 s6, 3
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s9, 0x40100a3d, s7
+; GCN-NEXT:    s_cmp_eq_u32 s6, 4
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GCN-NEXT:    s_cselect_b32 s6, 0x40140a3d, s9
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT:    s_cselect_b32 s2, 0x70a3d70a, s8
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}half4_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
-; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
-; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 4
-; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
-; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
-; GCN:     store_short v[{{[0-9:]+}}], v[[VRL]]
 define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: half4_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, 0x40003c00
+; GCN-NEXT:    s_mov_b32 s3, 0x44004200
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s4, s4, 4
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    flat_store_short v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}float2_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
-; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
 define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: float2_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double2_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3f847ae1
-; GCN-DAG: s_cselect_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x47ae147b
-; GCN: store_dwordx2 v[{{[0-9:]+}}]
 define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double2_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GCN-NEXT:    s_mov_b32 s3, 0xc28f5c29
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0x3f847ae1
+; GCN-NEXT:    s_cselect_b32 s3, s3, 0x47ae147b
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}half8_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
-; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
-; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
-; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
-; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
-; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
 define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: half8_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4200
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4500
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4600
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4700
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4800
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_short v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}short8_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
-; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
-; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
-; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
-; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
-; GCN:     store_short v[{{[0-9:]+}}], [[V7]]
 define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: short8_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 2, s[2:3]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 4, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 6, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 7, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 8, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_short v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
   store i16 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}float8_extelt:
-; GCN-DAG: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-DAG: s_load_dword [[S0:s[0-9]+]], s[2:3], 0x2c
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: s_waitcnt lgkmcnt(0)
-; GCN-DAG: s_mov_b32 m0, [[S0]]
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
-; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], v{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN:     flat_store_dword v[{{[0-9:]+}}], [[RES]]
 define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: float8_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_load_dword s2, s[2:3], 0x2c
+; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT:    v_mov_b32_e32 v3, 4.0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GCN-NEXT:    v_movrels_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double8_extelt:
-; GCN-NOT: buffer_
-; GCN-NOT: s_or_b32
-; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
-; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double8_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x24
+; GCN-NEXT:    s_load_dword s18, s[2:3], 0x2c
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_mov_b32 s15, 0x40200000
+; GCN-NEXT:    s_mov_b32 s13, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s11, 0x40180000
+; GCN-NEXT:    s_mov_b32 s9, 0x40140000
+; GCN-NEXT:    s_mov_b32 s7, 0x40100000
+; GCN-NEXT:    s_mov_b32 s5, 0x40080000
+; GCN-NEXT:    s_mov_b32 s3, 2.0
+; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s10, s0
+; GCN-NEXT:    s_mov_b32 s12, s0
+; GCN-NEXT:    s_mov_b32 s14, s0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s18, s18, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-NEXT:    s_mov_b32 m0, s18
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v6, s6
+; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
+; GCN-NEXT:    v_mov_b32_e32 v9, s9
+; GCN-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-NEXT:    v_mov_b32_e32 v13, s13
+; GCN-NEXT:    v_mov_b32_e32 v14, s14
+; GCN-NEXT:    v_movrels_b32_e32 v16, v1
+; GCN-NEXT:    v_movrels_b32_e32 v15, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[15:16]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double7_extelt:
-; GCN-NOT: buffer_
-; GCN-NOT: s_or_b32
-; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
-; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double7_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[14:15], s[2:3], 0x24
+; GCN-NEXT:    s_load_dword s16, s[2:3], 0x2c
+; GCN-NEXT:    s_mov_b32 s0, 0
+; GCN-NEXT:    s_mov_b32 s13, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s11, 0x40180000
+; GCN-NEXT:    s_mov_b32 s9, 0x40140000
+; GCN-NEXT:    s_mov_b32 s7, 0x40100000
+; GCN-NEXT:    s_mov_b32 s5, 0x40080000
+; GCN-NEXT:    s_mov_b32 s3, 2.0
+; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GCN-NEXT:    s_mov_b32 s2, s0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s6, s0
+; GCN-NEXT:    s_mov_b32 s8, s0
+; GCN-NEXT:    s_mov_b32 s10, s0
+; GCN-NEXT:    s_mov_b32 s12, s0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s16, s16, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v15, s15
+; GCN-NEXT:    s_mov_b32 m0, s16
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    v_mov_b32_e32 v5, s5
+; GCN-NEXT:    v_mov_b32_e32 v6, s6
+; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    v_mov_b32_e32 v8, s8
+; GCN-NEXT:    v_mov_b32_e32 v9, s9
+; GCN-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NEXT:    v_mov_b32_e32 v11, s11
+; GCN-NEXT:    v_mov_b32_e32 v12, s12
+; GCN-NEXT:    v_mov_b32_e32 v13, s13
+; GCN-NEXT:    v_mov_b32_e32 v14, s14
+; GCN-NEXT:    v_movrels_b32_e32 v16, v1
+; GCN-NEXT:    v_movrels_b32_e32 v15, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v1, s15
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[15:16]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <7 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}float16_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 m0,
-; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
-; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
-; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
 define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: float16_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_load_dword s2, s[2:3], 0x2c
+; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT:    v_mov_b32_e32 v3, 4.0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GCN-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GCN-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GCN-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GCN-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GCN-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GCN-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GCN-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GCN-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GCN-NEXT:    v_movrels_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double15_extelt:
-; GCN-NOT: buffer_
-; GCN-NOT: s_or_b32
-; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
-; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double15_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s36, 0
+; GCN-NEXT:    s_mov_b32 s65, 0x402e0000
+; GCN-NEXT:    s_mov_b32 s63, 0x402c0000
+; GCN-NEXT:    s_mov_b32 s61, 0x402a0000
+; GCN-NEXT:    s_mov_b32 s59, 0x40280000
+; GCN-NEXT:    s_mov_b32 s57, 0x40260000
+; GCN-NEXT:    s_mov_b32 s55, 0x40240000
+; GCN-NEXT:    s_mov_b32 s53, 0x40220000
+; GCN-NEXT:    s_mov_b32 s51, 0x40200000
+; GCN-NEXT:    s_mov_b32 s49, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s47, 0x40180000
+; GCN-NEXT:    s_mov_b32 s45, 0x40140000
+; GCN-NEXT:    s_mov_b32 s43, 0x40100000
+; GCN-NEXT:    s_mov_b32 s41, 0x40080000
+; GCN-NEXT:    s_mov_b32 s39, 2.0
+; GCN-NEXT:    s_mov_b32 s37, 0x3ff00000
+; GCN-NEXT:    s_mov_b32 s38, s36
+; GCN-NEXT:    s_mov_b32 s40, s36
+; GCN-NEXT:    s_mov_b32 s42, s36
+; GCN-NEXT:    s_mov_b32 s44, s36
+; GCN-NEXT:    s_mov_b32 s46, s36
+; GCN-NEXT:    s_mov_b32 s48, s36
+; GCN-NEXT:    s_mov_b32 s50, s36
+; GCN-NEXT:    s_mov_b32 s52, s36
+; GCN-NEXT:    s_mov_b32 s54, s36
+; GCN-NEXT:    s_mov_b32 s56, s36
+; GCN-NEXT:    s_mov_b32 s58, s36
+; GCN-NEXT:    s_mov_b32 s60, s36
+; GCN-NEXT:    s_mov_b32 s62, s36
+; GCN-NEXT:    s_mov_b32 s64, s36
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s2, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NEXT:    v_mov_b32_e32 v31, s67
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NEXT:    v_mov_b32_e32 v4, s40
+; GCN-NEXT:    v_mov_b32_e32 v5, s41
+; GCN-NEXT:    v_mov_b32_e32 v6, s42
+; GCN-NEXT:    v_mov_b32_e32 v7, s43
+; GCN-NEXT:    v_mov_b32_e32 v8, s44
+; GCN-NEXT:    v_mov_b32_e32 v9, s45
+; GCN-NEXT:    v_mov_b32_e32 v10, s46
+; GCN-NEXT:    v_mov_b32_e32 v11, s47
+; GCN-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-NEXT:    v_mov_b32_e32 v13, s49
+; GCN-NEXT:    v_mov_b32_e32 v14, s50
+; GCN-NEXT:    v_mov_b32_e32 v15, s51
+; GCN-NEXT:    v_mov_b32_e32 v16, s52
+; GCN-NEXT:    v_mov_b32_e32 v17, s53
+; GCN-NEXT:    v_mov_b32_e32 v18, s54
+; GCN-NEXT:    v_mov_b32_e32 v19, s55
+; GCN-NEXT:    v_mov_b32_e32 v20, s56
+; GCN-NEXT:    v_mov_b32_e32 v21, s57
+; GCN-NEXT:    v_mov_b32_e32 v22, s58
+; GCN-NEXT:    v_mov_b32_e32 v23, s59
+; GCN-NEXT:    v_mov_b32_e32 v24, s60
+; GCN-NEXT:    v_mov_b32_e32 v25, s61
+; GCN-NEXT:    v_mov_b32_e32 v26, s62
+; GCN-NEXT:    v_mov_b32_e32 v27, s63
+; GCN-NEXT:    v_mov_b32_e32 v28, s64
+; GCN-NEXT:    v_mov_b32_e32 v29, s65
+; GCN-NEXT:    v_mov_b32_e32 v30, s66
+; GCN-NEXT:    v_movrels_b32_e32 v32, v1
+; GCN-NEXT:    v_movrels_b32_e32 v31, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[31:32]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <15 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}double16_extelt:
-; GCN-NOT: buffer_
-; GCN-NOT: s_or_b32
-; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[#BASE:]], [[ZERO]]
-; GCN-DAG: s_mov_b32 m0, [[IND:s[0-9]+]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_LO:[0-9]+]], v[[#BASE]]
-; GCN-DAG: v_movrels_b32_e32 v[[RES_HI:[0-9]+]], v[[#BASE+1]]
-; GCN:     store_dwordx2 v[{{[0-9:]+}}], v[[[RES_LO]]:[[RES_HI]]]
 define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: double16_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s36, 0
+; GCN-NEXT:    s_mov_b32 s67, 0x40300000
+; GCN-NEXT:    s_mov_b32 s65, 0x402e0000
+; GCN-NEXT:    s_mov_b32 s63, 0x402c0000
+; GCN-NEXT:    s_mov_b32 s61, 0x402a0000
+; GCN-NEXT:    s_mov_b32 s59, 0x40280000
+; GCN-NEXT:    s_mov_b32 s57, 0x40260000
+; GCN-NEXT:    s_mov_b32 s55, 0x40240000
+; GCN-NEXT:    s_mov_b32 s53, 0x40220000
+; GCN-NEXT:    s_mov_b32 s51, 0x40200000
+; GCN-NEXT:    s_mov_b32 s49, 0x401c0000
+; GCN-NEXT:    s_mov_b32 s47, 0x40180000
+; GCN-NEXT:    s_mov_b32 s45, 0x40140000
+; GCN-NEXT:    s_mov_b32 s43, 0x40100000
+; GCN-NEXT:    s_mov_b32 s41, 0x40080000
+; GCN-NEXT:    s_mov_b32 s39, 2.0
+; GCN-NEXT:    s_mov_b32 s37, 0x3ff00000
+; GCN-NEXT:    s_mov_b32 s38, s36
+; GCN-NEXT:    s_mov_b32 s40, s36
+; GCN-NEXT:    s_mov_b32 s42, s36
+; GCN-NEXT:    s_mov_b32 s44, s36
+; GCN-NEXT:    s_mov_b32 s46, s36
+; GCN-NEXT:    s_mov_b32 s48, s36
+; GCN-NEXT:    s_mov_b32 s50, s36
+; GCN-NEXT:    s_mov_b32 s52, s36
+; GCN-NEXT:    s_mov_b32 s54, s36
+; GCN-NEXT:    s_mov_b32 s56, s36
+; GCN-NEXT:    s_mov_b32 s58, s36
+; GCN-NEXT:    s_mov_b32 s60, s36
+; GCN-NEXT:    s_mov_b32 s62, s36
+; GCN-NEXT:    s_mov_b32 s64, s36
+; GCN-NEXT:    s_mov_b32 s66, s36
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s2, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v0, s36
+; GCN-NEXT:    v_mov_b32_e32 v1, s37
+; GCN-NEXT:    v_mov_b32_e32 v31, s67
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s38
+; GCN-NEXT:    v_mov_b32_e32 v3, s39
+; GCN-NEXT:    v_mov_b32_e32 v4, s40
+; GCN-NEXT:    v_mov_b32_e32 v5, s41
+; GCN-NEXT:    v_mov_b32_e32 v6, s42
+; GCN-NEXT:    v_mov_b32_e32 v7, s43
+; GCN-NEXT:    v_mov_b32_e32 v8, s44
+; GCN-NEXT:    v_mov_b32_e32 v9, s45
+; GCN-NEXT:    v_mov_b32_e32 v10, s46
+; GCN-NEXT:    v_mov_b32_e32 v11, s47
+; GCN-NEXT:    v_mov_b32_e32 v12, s48
+; GCN-NEXT:    v_mov_b32_e32 v13, s49
+; GCN-NEXT:    v_mov_b32_e32 v14, s50
+; GCN-NEXT:    v_mov_b32_e32 v15, s51
+; GCN-NEXT:    v_mov_b32_e32 v16, s52
+; GCN-NEXT:    v_mov_b32_e32 v17, s53
+; GCN-NEXT:    v_mov_b32_e32 v18, s54
+; GCN-NEXT:    v_mov_b32_e32 v19, s55
+; GCN-NEXT:    v_mov_b32_e32 v20, s56
+; GCN-NEXT:    v_mov_b32_e32 v21, s57
+; GCN-NEXT:    v_mov_b32_e32 v22, s58
+; GCN-NEXT:    v_mov_b32_e32 v23, s59
+; GCN-NEXT:    v_mov_b32_e32 v24, s60
+; GCN-NEXT:    v_mov_b32_e32 v25, s61
+; GCN-NEXT:    v_mov_b32_e32 v26, s62
+; GCN-NEXT:    v_mov_b32_e32 v27, s63
+; GCN-NEXT:    v_mov_b32_e32 v28, s64
+; GCN-NEXT:    v_mov_b32_e32 v29, s65
+; GCN-NEXT:    v_mov_b32_e32 v30, s66
+; GCN-NEXT:    v_movrels_b32_e32 v32, v1
+; GCN-NEXT:    v_movrels_b32_e32 v31, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[31:32]
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}float32_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 m0,
-; GCN-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40a00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40c00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40e00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41000000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41100000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41200000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41300000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41400000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41500000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41600000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41700000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41800000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41880000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41980000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41c80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41d80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41e80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f00000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41f80000
-; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x42000000
-; GCN-DAG: v_movrels_b32_e32 [[RES:v[0-9]+]], [[VLO]]
-; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
 define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: float32_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    v_mov_b32_e32 v0, 1.0
+; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 m0, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, 4.0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GCN-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GCN-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GCN-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GCN-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GCN-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GCN-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GCN-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GCN-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GCN-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GCN-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GCN-NEXT:    v_mov_b32_e32 v16, 0x41880000
+; GCN-NEXT:    v_mov_b32_e32 v17, 0x41900000
+; GCN-NEXT:    v_mov_b32_e32 v18, 0x41980000
+; GCN-NEXT:    v_mov_b32_e32 v19, 0x41a00000
+; GCN-NEXT:    v_mov_b32_e32 v20, 0x41a80000
+; GCN-NEXT:    v_mov_b32_e32 v21, 0x41b00000
+; GCN-NEXT:    v_mov_b32_e32 v22, 0x41b80000
+; GCN-NEXT:    v_mov_b32_e32 v23, 0x41c00000
+; GCN-NEXT:    v_mov_b32_e32 v24, 0x41c80000
+; GCN-NEXT:    v_mov_b32_e32 v25, 0x41d00000
+; GCN-NEXT:    v_mov_b32_e32 v26, 0x41d80000
+; GCN-NEXT:    v_mov_b32_e32 v27, 0x41e00000
+; GCN-NEXT:    v_mov_b32_e32 v28, 0x41e80000
+; GCN-NEXT:    v_mov_b32_e32 v29, 0x41f00000
+; GCN-NEXT:    v_mov_b32_e32 v30, 0x41f80000
+; GCN-NEXT:    v_mov_b32_e32 v31, 0x42000000
+; GCN-NEXT:    v_movrels_b32_e32 v2, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}byte8_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x4030201
-; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x8070605
-; GCN-DAG: s_lshl_b32 [[SEL:s[0-p]+]], s{{[0-9]+}}, 3
-; GCN:     s_lshr_b64 s[[[RL:[0-9]+]]:{{[0-9]+}}], s[[[SL]]:[[SH]]], [[SEL]]
-; GCN-DAG: v_mov_b32_e32 v[[VRL:[0-9]+]], s[[RL]]
-; GCN:     store_byte v[{{[0-9:]+}}], v[[VRL]]
 define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: byte8_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_mov_b32 s2, 0x4030201
+; GCN-NEXT:    s_mov_b32 s3, 0x8070605
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s4, s4, 3
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    flat_store_byte v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
   store i8 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}byte16_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
-; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
-; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
-; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
-; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
-; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
-; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
-; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 8
-; GCN-DAG: s_cselect_b64 [[C8:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 9
-; GCN-DAG: s_cselect_b64 [[C9:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 10
-; GCN-DAG: s_cselect_b64 [[C10:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 11
-; GCN-DAG: s_cselect_b64 [[C11:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 12
-; GCN-DAG: s_cselect_b64 [[C12:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 13
-; GCN-DAG: s_cselect_b64 [[C13:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 14
-; GCN-DAG: s_cselect_b64 [[C14:[^,]+]], -1, 0
-; GCN-DAG: s_cmp_lg_u32 [[IDX]], 15
-; GCN-DAG: s_cselect_b64 [[C15:[^,]+]], -1, 0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V4:v[0-9]+]], {{[^,]+}}, [[V3]], [[C4]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V5:v[0-9]+]], {{[^,]+}}, [[V4]], [[C5]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V6:v[0-9]+]], {{[^,]+}}, [[V5]], [[C6]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V7:v[0-9]+]], {{[^,]+}}, [[V6]], [[C7]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V8:v[0-9]+]], {{[^,]+}}, [[V7]], [[C8]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V9:v[0-9]+]], {{[^,]+}}, [[V8]], [[C8]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V10:v[0-9]+]], {{[^,]+}}, [[V9]], [[C10]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V11:v[0-9]+]], {{[^,]+}}, [[V10]], [[C11]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V12:v[0-9]+]], {{[^,]+}}, [[V11]], [[C12]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V13:v[0-9]+]], {{[^,]+}}, [[V12]], [[C13]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V14:v[0-9]+]], {{[^,]+}}, [[V13]], [[C14]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V15:v[0-9]+]], {{[^,]+}}, [[V14]], [[C15]]
-; GCN:     store_byte v[{{[0-9:]+}}], [[V15]]
 define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: byte16_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 2, s[2:3]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 4, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 6, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 7, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 8, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 9
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 9, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 10
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 10, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 11
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 11, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 12, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 13
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 13, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 14, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 15, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 16, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_byte v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
   store i8 %ext, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}bit4_extelt:
+define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: bit4_extelt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
@@ -421,7 +735,7 @@ entry:
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
-define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
   %zext = zext i1 %ext to i32
@@ -429,15 +743,398 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}bit128_extelt:
-; GCN-NOT: buffer_
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1
-; GCN: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
-; GCN: s_cselect_b64 [[CL:[^,]+]], -1, 0
-; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
-; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
-; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
 define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) {
+; GCN-LABEL: bit128_extelt:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_lg_u32 s4, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 9
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 10
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 11
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 13
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 16
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 17
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 18
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 19
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 20
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 21
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 22
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 23
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 24
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 25
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 26
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 27
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 28
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 29
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 30
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 31
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 32
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 33
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 34
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 35
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 36
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 37
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 38
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 39
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 40
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 41
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 42
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 43
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 44
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 45
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 46
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 47
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 48
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 49
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 50
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 51
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 52
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 53
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 54
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 55
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 56
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 57
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 58
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 59
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 60
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 61
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 62
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 63
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 64
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x41
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x42
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x43
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x44
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x45
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x46
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x47
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x48
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x49
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4a
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4b
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4c
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4d
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4e
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4f
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x50
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x51
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x52
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x53
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x54
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x55
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x56
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x57
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x58
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x59
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5a
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5b
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5c
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5d
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5e
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5f
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x60
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x61
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x62
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x63
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x64
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x65
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x66
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x67
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x68
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x69
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6a
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6b
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6c
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6d
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6e
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6f
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x70
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x71
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x72
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x73
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x74
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x75
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x76
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x77
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x78
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x79
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7a
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7b
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7c
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7d
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7e
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7f
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_and_b32_e32 v2, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:    s_endpgm
 entry:
   %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
   %zext = zext i1 %ext to i32
@@ -445,29 +1142,177 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}float32_extelt_vec:
-; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 1.0, 2.0, [[CC1]]
-; GCN-DAG: v_mov_b32_e32 [[LASTVAL:v[0-9]+]], 0x42000000
-; GCN-DAG: v_cmp_ne_u32_e32 [[LASTCC:[^,]+]], 31, v0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} v0, [[LASTVAL]], v{{[0-9]+}}, [[LASTCC]]
 define float @float32_extelt_vec(i32 %sel) {
+; GCN-LABEL: float32_extelt_vec:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40c00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x40e00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41000000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41100000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41300000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41400000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41600000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41700000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41800000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41880000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 16, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41900000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 17, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41980000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 18, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41a00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 19, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41a80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 20, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41b00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 21, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41b80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 22, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41c00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 23, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41c80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 24, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41d00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 25, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41d80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41e00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 27, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41e80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 28, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41f00000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 29, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x41f80000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 30, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 31, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
   ret float %ext
 }
 
-; GCN-LABEL: {{^}}double16_extelt_vec:
-; GCN-NOT: buffer_
-; GCN-DAG: v_mov_b32_e32 [[V1HI:v[0-9]+]], 0x3ff19999
-; GCN-DAG: v_mov_b32_e32 [[V1LO:v[0-9]+]], 0x9999999a
-; GCN-DAG: v_mov_b32_e32 [[V2HI:v[0-9]+]], 0x4000cccc
-; GCN-DAG: v_mov_b32_e32 [[V2LO:v[0-9]+]], 0xcccccccd
-; GCN-DAG: v_cmp_eq_u32_e{{32|64}} [[CC1:[^,]+]], 1, v0
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1HI:v[0-9]+]], [[V1HI]], [[V2HI]], [[CC1]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[R1LO:v[0-9]+]], [[V1LO]], [[V2LO]], [[CC1]]
 define double @double16_extelt_vec(i32 %sel) {
+; GCN-LABEL: double16_extelt_vec:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v3, 0x3ff19999
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x4000cccc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x9999999a
+; GCN-NEXT:    v_mov_b32_e32 v2, 0xcccccccd
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x4008cccc
+; GCN-NEXT:    s_or_b64 vcc, s[4:5], vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40106666
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40146666
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40186666
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x401c6666
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x66666666
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    s_or_b64 vcc, vcc, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40203333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40223333
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 8, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40243333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40263333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x40283333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x402a3333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x402c3333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT:    v_mov_b32_e32 v5, 0x402e3333
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0x33333333
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GCN-NEXT:    s_or_b64 vcc, vcc, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x40301999
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
   ret double %ext
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 164352ef75b3b9..b3a3c775e76f43 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -1,25 +1,65 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
 
-; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
-; GCN: s_load_dword [[LOAD:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
-; GCN: buffer_store_byte [[V_LOAD]]
 define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v1i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x2
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v1i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <1 x i8> %foo, i32 0
   store i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
-; GCN: s_load_dword s
-; GCN-NOT: {{flat|buffer|global}}
-; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
-; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
 define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v2i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x2
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 8
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v2i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -28,14 +68,38 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
-; GCN: s_load_dword s
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
 define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v3i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x2
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v3i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -44,14 +108,38 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
-; GCN: s_load_dword s
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
 define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v4i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x2
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v4i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -60,15 +148,40 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
-; GCN-NOT: {{s|flat|buffer|global}}_load
-; GCN: s_load_dword [[VAL:s[0-9]+]]
-; GCN-NOT: {{s|flat|buffer|global}}_load
-; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
-; GCN-NOT: {{s|flat|buffer|global}}_load
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
 define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v8i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s1, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v8i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
   store volatile i8 %p1, ptr addrspace(1) null
@@ -76,15 +189,38 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
-; GCN: s_load_dword [[LOAD0:s[0-9]+]]
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
-; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
-; GCN: buffer_store_byte [[V_ELT2]]
-; GCN: buffer_store_byte [[V_LOAD0]]
 define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v16i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v16i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -93,16 +229,40 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
-; GCN-NOT: {{s|flat|buffer|global}}_load
-; GCN: s_load_dword [[VAL:s[0-9]+]]
-; GCN-NOT: {{s|flat|buffer|global}}_load
-; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
-; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
-; GCN: buffer_store_byte [[V_ELT2]]
-; GCN: buffer_store_byte [[V_LOAD0]]
 define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v32i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s0, 0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s1, s0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v32i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
   store volatile i8 %p1, ptr addrspace(1) null
@@ -110,15 +270,38 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
-; GCN: s_load_dword [[LOAD0:s[0-9]+]]
-; GCN-NOT: {{flat|buffer|global}}
-; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
-; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
-; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
-; GCN: buffer_store_byte [[V_ELT2]]
-; GCN: buffer_store_byte [[V_LOAD0]]
 define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v64i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v64i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x40
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -132,45 +315,112 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
 ; FIXME: 16-bit and 32-bit shift not combined after legalize to to
 ; isTypeDesirableForOp in SimplifyDemandedBits
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c
-; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28
-; VI-NOT: {{flat|buffer|global}}
-; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
-; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
-; VI: buffer_store_byte [[ELT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
+; SI-LABEL: dynamic_extract_vector_elt_v2i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[6:7], 0xa
+; SI-NEXT:    s_load_dword s4, s[6:7], 0x13
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_and_b32 s2, s2, 0xffff
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshr_b32 s4, s2, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_extract_vector_elt_v2i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x4c
+; VI-NEXT:    s_load_dword s5, s[6:7], 0x28
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    v_lshrrev_b16_e32 v0, s4, v0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %elt = extractelement <2 x i8> %foo, i32 %idx
   store volatile i8 %elt, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x4c
-; VI-NEXT: s_load_dword [[LOAD:s[0-9]+]], s[6:7], 0x28
-; VI-NOT: {{flat|buffer|global}}
-; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
-; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
-; VI: buffer_store_byte [[V_ELT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
+; SI-LABEL: dynamic_extract_vector_elt_v3i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s2, s[6:7], 0x13
+; SI-NEXT:    s_load_dword s4, s[6:7], 0xa
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshl_b32 s2, s2, 3
+; SI-NEXT:    s_lshr_b32 s4, s4, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_extract_vector_elt_v3i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x4c
+; VI-NEXT:    s_load_dword s5, s[6:7], 0x28
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshr_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
   store volatile i8 %p0, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x30
-; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
-
-; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]]
-
-; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
-; VI: buffer_store_byte [[V_EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
+; SI-LABEL: dynamic_extract_vector_elt_v4i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s4, s2, s4
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_extract_vector_elt_v4i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_load_dword s8, s[6:7], 0x30
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshl_b32 s0, s8, 3
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s0, s2, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %vec = load <4 x i8>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <4 x i8> %vec, i32 %idx
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -178,15 +428,40 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
-; VI: s_load_dword [[IDX:s[0-9]+]], s[6:7], 0x10
-; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
-
-; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
-; VI: s_lshr_b64 s[[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]]
-; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
-; VI: buffer_store_byte [[V_EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
+; SI-LABEL: dynamic_extract_vector_elt_v8i8:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_lshl_b32 s2, s6, 3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_extract_vector_elt_v8i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_load_dword s8, s[6:7], 0x10
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshl_b32 s0, s8, 3
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %vec = load <8 x i8>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <8 x i8> %vec, i32 %idx
   %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
@@ -194,14 +469,54 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0123:
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dword s
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
+; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], 0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s0, 8
+; SI-NEXT:    s_lshr_b32 s4, s0, 16
+; SI-NEXT:    s_lshr_b32 s5, s0, 24
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v8i8_extract_0123:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b64 s[0:1], 0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s1, s0, 8
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    s_lshr_b32 s5, s0, 24
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <8 x i8> %load, i32 0
   %elt1 = extractelement <8 x i8> %load, i32 1
@@ -214,13 +529,52 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_0145:
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dwordx2
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
+; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], 0
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s4, s0, 8
+; SI-NEXT:    s_lshr_b32 s5, s1, 8
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v8i8_extract_0145:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b64 s[0:1], 0
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s4, s0, 8
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_lshr_b32 s5, s1, 8
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <8 x i8> %load, i32 0
   %elt1 = extractelement <8 x i8> %load, i32 1
@@ -233,13 +587,38 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8i8_extract_45:
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 4{{$}}
-; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}}
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
+; SI-LABEL: reduce_load_vector_v8i8_extract_45:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], 4
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s0, 8
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v8i8_extract_45:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b64 s[0:1], 4
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s1, s0, 8
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) null
   %elt4 = extractelement <8 x i8> %load, i32 4
   %elt5 = extractelement <8 x i8> %load, i32 5
@@ -249,13 +628,52 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
 }
 
 ; FIXME: ought to be able to eliminate high half of load
-; GCN-LABEL: {{^}}reduce_load_vector_v16i8_extract_0145:
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dwordx4
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
 define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
+; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b64 s[0:1], 0
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s3, 0x100f000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_lshr_b32 s4, s0, 8
+; SI-NEXT:    s_lshr_b32 s5, s1, 8
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s5
+; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v16i8_extract_0145:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b64 s[0:1], 0
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_lshr_b32 s4, s0, 8
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    s_lshr_b32 s5, s1, 8
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
   %load = load <16 x i8>, ptr addrspace(4) null
   %elt0 = extractelement <16 x i8> %load, i32 0
   %elt1 = extractelement <16 x i8> %load, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 93e210bb4c8090..fab001eddad5c6 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,23 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
 
-; GCN-LABEL: {{^}}s_abs_v2i16:
-; GFX9: s_load_dword [[VAL:s[0-9]+]]
-; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
-; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
-
-; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; CIVI: s_sub_i32
-; CIVI: s_sub_i32
-; CIVI: s_max_i32
-; CIVI: s_max_i32
-; CIVI: s_add_i32
-; CIVI-DAG: s_add_i32
-; CIVI-DAG: s_and_b32
-; CIVI-DAG: s_or_b32
+
 define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 {
+; GFX9-LABEL: s_abs_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, 0, s4
+; GFX9-NEXT:    v_pk_max_i16 v0, s4, v0
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: s_abs_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    s_sub_i32 s6, 0, s4
+; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_ashr_i32 s7, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_max_i32 s4, s4, s6
+; VI-NEXT:    s_max_i32 s5, s7, s5
+; VI-NEXT:    s_add_i32 s4, s4, 2
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    s_add_i32 s4, s4, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: s_abs_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_ashr_i32 s5, s4, 16
+; CI-NEXT:    s_lshr_b32 s6, s4, 16
+; CI-NEXT:    s_sext_i32_i16 s7, s4
+; CI-NEXT:    s_sub_i32 s4, 0, s4
+; CI-NEXT:    s_sext_i32_i16 s4, s4
+; CI-NEXT:    s_sub_i32 s6, 0, s6
+; CI-NEXT:    s_sext_i32_i16 s6, s6
+; CI-NEXT:    s_max_i32 s4, s7, s4
+; CI-NEXT:    s_max_i32 s5, s5, s6
+; CI-NEXT:    s_add_i32 s4, s4, 2
+; CI-NEXT:    s_lshl_b32 s5, s5, 16
+; CI-NEXT:    s_and_b32 s4, s4, 0xffff
+; CI-NEXT:    s_or_b32 s4, s5, s4
+; CI-NEXT:    s_add_i32 s4, s4, 0x20000
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %neg = sub <2 x i16> zeroinitializer, %val
   %cond = icmp sgt <2 x i16> %val, %neg
   %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
@@ -26,34 +75,73 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
   ret void
 }
 
-; GCN-LABEL: {{^}}v_abs_v2i16:
-; GFX9: global_load_dword [[VAL:v[0-9]+]]
-; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
-; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
-
-; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
-; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}}
-; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
-; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]]  dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NOT: v_and_b32
-; VI: v_or_b32_e32
-
-; CI: buffer_load_dword v
-; CI: v_lshrrev_b32_e32
-; CI-DAG: v_sub_i32_e32
-; CI-DAG: v_bfe_i32
-; CI-DAG: v_bfe_i32
-; CI-DAG: v_max_i32
-; CI-DAG: v_max_i32
-; CI-DAG: v_add_i32
-; CI-DAG: v_add_i32
-; CI-DAG: v_or_b32
 define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
+; GFX9-LABEL: v_abs_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT:    v_pk_max_i16 v1, v1, v2
+; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: v_abs_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v5, 2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v3, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_sub_u16_e32 v2, 0, v3
+; VI-NEXT:    v_sub_u16_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_max_i16_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v2, v3, v2
+; VI-NEXT:    v_add_u16_e32 v2, 2, v2
+; VI-NEXT:    v_add_u16_sdwa v3, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: v_abs_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, 0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_bfe_i32 v3, v2, 0, 16
+; CI-NEXT:    v_ashrrev_i32_e32 v4, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; CI-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; CI-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; CI-NEXT:    v_sub_i32_e32 v5, vcc, 0, v5
+; CI-NEXT:    v_bfe_i32 v5, v5, 0, 16
+; CI-NEXT:    v_max_i32_e32 v2, v3, v2
+; CI-NEXT:    v_max_i32_e32 v3, v4, v5
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 2, v2
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_or_b32_e32 v2, v3, v2
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x20000, v2
+; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; CI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.in = getelementptr inbounds <2 x i16>, ptr addrspace(1) %src, i32 %tid
   %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
@@ -66,12 +154,69 @@ define amdgpu_kernel void @v_abs_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}s_abs_v2i16_2:
-; GFX9: s_load_dword [[VAL:s[0-9]+]]
-; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
-; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val) #0 {
+; GFX9-LABEL: s_abs_v2i16_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, 0, s4
+; GFX9-NEXT:    v_pk_max_i16 v0, s4, v0
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: s_abs_v2i16_2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    s_sub_i32 s6, 0, s4
+; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_ashr_i32 s7, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_max_i32 s4, s4, s6
+; VI-NEXT:    s_max_i32 s5, s7, s5
+; VI-NEXT:    s_add_i32 s4, s4, 2
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    s_add_i32 s4, s4, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: s_abs_v2i16_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dword s4, s[2:3], 0xb
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_ashr_i32 s5, s4, 16
+; CI-NEXT:    s_lshr_b32 s6, s4, 16
+; CI-NEXT:    s_sext_i32_i16 s7, s4
+; CI-NEXT:    s_sub_i32 s4, 0, s4
+; CI-NEXT:    s_sext_i32_i16 s4, s4
+; CI-NEXT:    s_sub_i32 s6, 0, s6
+; CI-NEXT:    s_sext_i32_i16 s6, s6
+; CI-NEXT:    s_max_i32 s4, s7, s4
+; CI-NEXT:    s_max_i32 s5, s5, s6
+; CI-NEXT:    s_add_i32 s4, s4, 2
+; CI-NEXT:    s_lshl_b32 s5, s5, 16
+; CI-NEXT:    s_and_b32 s4, s4, 0xffff
+; CI-NEXT:    s_or_b32 s4, s5, s4
+; CI-NEXT:    s_add_i32 s4, s4, 0x20000
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
@@ -84,12 +229,77 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_abs_v2i16_2:
-; GFX9: global_load_dword [[VAL:v[0-9]+]]
-; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
-; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
-; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
+; GFX9-LABEL: v_abs_v2i16_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v0, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v1, 0, v0
+; GFX9-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: v_abs_v2i16_2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_sub_u16_e32 v3, 0, v0
+; VI-NEXT:    v_sub_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v0, v0, v3
+; VI-NEXT:    v_add_u16_e32 v0, 2, v0
+; VI-NEXT:    v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: v_abs_v2i16_2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_mov_b32 s11, s7
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s4, s0
+; CI-NEXT:    s_mov_b32 s5, s1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; CI-NEXT:    v_ashrrev_i32_e32 v2, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, 0, v3
+; CI-NEXT:    v_bfe_i32 v3, v3, 0, 16
+; CI-NEXT:    v_max_i32_e32 v0, v1, v0
+; CI-NEXT:    v_max_i32_e32 v1, v2, v3
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x20000, v0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_endpgm
   %z0 = insertelement <2 x i16> undef, i16 0, i16 0
   %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
   %t0 = insertelement <2 x i16> undef, i16 2, i16 0
@@ -105,15 +315,105 @@ define amdgpu_kernel void @v_abs_v2i16_2(ptr addrspace(1) %out, ptr addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}s_abs_v4i16:
-; GFX9: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[2:3], 0x24
-; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[#LOAD + 2]]
-; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]]
-; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]]
-; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[#LOAD + 3]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0 {
+; GFX9-LABEL: s_abs_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v0, 0, s7
+; GFX9-NEXT:    v_pk_sub_i16 v1, 0, s6
+; GFX9-NEXT:    v_pk_max_i16 v2, s6, v1
+; GFX9-NEXT:    v_pk_max_i16 v0, s7, v0
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_add_u16 v1, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_u16 v0, v2, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: s_abs_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_lshr_b32 s0, s2, 16
+; VI-NEXT:    s_lshr_b32 s1, s3, 16
+; VI-NEXT:    s_sub_i32 s8, 0, s3
+; VI-NEXT:    s_sub_i32 s9, 0, s2
+; VI-NEXT:    s_sub_i32 s1, 0, s1
+; VI-NEXT:    s_sub_i32 s0, 0, s0
+; VI-NEXT:    s_ashr_i32 s10, s2, 16
+; VI-NEXT:    s_ashr_i32 s11, s3, 16
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_sext_i32_i16 s3, s3
+; VI-NEXT:    s_sext_i32_i16 s9, s9
+; VI-NEXT:    s_sext_i32_i16 s8, s8
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_max_i32 s3, s3, s8
+; VI-NEXT:    s_max_i32 s2, s2, s9
+; VI-NEXT:    s_max_i32 s1, s11, s1
+; VI-NEXT:    s_max_i32 s0, s10, s0
+; VI-NEXT:    s_add_i32 s2, s2, 2
+; VI-NEXT:    s_add_i32 s3, s3, 2
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s1, s1, s3
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_add_i32 s1, s1, 0x20000
+; VI-NEXT:    s_add_i32 s0, s0, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: s_abs_v4i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s0
+; CI-NEXT:    s_mov_b32 s5, s1
+; CI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
+; CI-NEXT:    s_ashr_i32 s1, s2, 16
+; CI-NEXT:    s_lshr_b32 s8, s2, 16
+; CI-NEXT:    s_lshr_b32 s9, s3, 16
+; CI-NEXT:    s_sext_i32_i16 s10, s3
+; CI-NEXT:    s_sext_i32_i16 s11, s2
+; CI-NEXT:    s_sub_i32 s3, 0, s3
+; CI-NEXT:    s_sub_i32 s2, 0, s2
+; CI-NEXT:    s_sext_i32_i16 s3, s3
+; CI-NEXT:    s_sext_i32_i16 s2, s2
+; CI-NEXT:    s_sub_i32 s9, 0, s9
+; CI-NEXT:    s_sub_i32 s8, 0, s8
+; CI-NEXT:    s_sext_i32_i16 s9, s9
+; CI-NEXT:    s_sext_i32_i16 s8, s8
+; CI-NEXT:    s_max_i32 s2, s11, s2
+; CI-NEXT:    s_max_i32 s3, s10, s3
+; CI-NEXT:    s_max_i32 s1, s1, s8
+; CI-NEXT:    s_max_i32 s0, s0, s9
+; CI-NEXT:    s_add_i32 s3, s3, 2
+; CI-NEXT:    s_add_i32 s2, s2, 2
+; CI-NEXT:    s_lshl_b32 s0, s0, 16
+; CI-NEXT:    s_and_b32 s3, s3, 0xffff
+; CI-NEXT:    s_lshl_b32 s1, s1, 16
+; CI-NEXT:    s_and_b32 s2, s2, 0xffff
+; CI-NEXT:    s_or_b32 s0, s0, s3
+; CI-NEXT:    s_or_b32 s1, s1, s2
+; CI-NEXT:    s_add_i32 s0, s0, 0x20000
+; CI-NEXT:    s_add_i32 s1, s1, 0x20000
+; CI-NEXT:    v_mov_b32_e32 v0, s1
+; CI-NEXT:    v_mov_b32_e32 v1, s0
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    s_endpgm
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -130,17 +430,103 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
   ret void
 }
 
-; GCN-LABEL: {{^}}v_abs_v4i16:
-; GFX9: global_load_dwordx2 v[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]]
 
-; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
-; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
-; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
 
-; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
-; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
-; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
 define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %src) #0 {
+; GFX9-LABEL: v_abs_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT:    v_pk_sub_i16 v3, 0, v0
+; GFX9-NEXT:    v_pk_max_i16 v0, v0, v3
+; GFX9-NEXT:    v_pk_max_i16 v1, v1, v2
+; GFX9-NEXT:    v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: v_abs_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_sub_u16_e32 v4, 0, v1
+; VI-NEXT:    v_sub_u16_sdwa v5, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_sub_u16_e32 v6, 0, v0
+; VI-NEXT:    v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_max_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v0, v0, v6
+; VI-NEXT:    v_max_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_max_i16_e32 v1, v1, v4
+; VI-NEXT:    v_add_u16_e32 v1, 2, v1
+; VI-NEXT:    v_add_u16_sdwa v4, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_u16_e32 v0, 2, v0
+; VI-NEXT:    v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: v_abs_v4i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s10, 0
+; CI-NEXT:    s_mov_b32 s11, s3
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s0, s4
+; CI-NEXT:    s_mov_b32 s1, s5
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
+; CI-NEXT:    v_bfe_i32 v4, v1, 0, 16
+; CI-NEXT:    v_bfe_i32 v3, v0, 0, 16
+; CI-NEXT:    v_ashrrev_i32_e32 v5, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; CI-NEXT:    v_sub_i32_e32 v7, vcc, 0, v7
+; CI-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; CI-NEXT:    v_sub_i32_e32 v6, vcc, 0, v6
+; CI-NEXT:    v_bfe_i32 v7, v7, 0, 16
+; CI-NEXT:    v_bfe_i32 v6, v6, 0, 16
+; CI-NEXT:    v_max_i32_e32 v0, v3, v0
+; CI-NEXT:    v_max_i32_e32 v1, v4, v1
+; CI-NEXT:    v_max_i32_e32 v3, v5, v6
+; CI-NEXT:    v_max_i32_e32 v2, v2, v7
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_or_b32_e32 v1, v2, v1
+; CI-NEXT:    v_or_b32_e32 v0, v3, v0
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x20000, v1
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x20000, v0
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_endpgm
   %z0 = insertelement <4 x i16> undef, i16 0, i16 0
   %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
   %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
@@ -160,10 +546,98 @@ define amdgpu_kernel void @v_abs_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}s_min_max_v2i16:
-; GFX9: v_pk_max_i16
-; GFX9: v_pk_min_i16
 define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+; GFX9-LABEL: s_min_max_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x34
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s2, s10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    s_mov_b32 s9, s5
+; GFX9-NEXT:    v_pk_max_i16 v1, s12, v0
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    s_mov_b32 s3, s11
+; GFX9-NEXT:    v_pk_min_i16 v0, s12, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: s_min_max_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x34
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s2, s10
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s4
+; VI-NEXT:    s_mov_b32 s9, s5
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    s_ashr_i32 s4, s12, 16
+; VI-NEXT:    s_sext_i32_i16 s5, s12
+; VI-NEXT:    s_ashr_i32 s6, s13, 16
+; VI-NEXT:    s_sext_i32_i16 s7, s13
+; VI-NEXT:    s_max_i32 s12, s4, s6
+; VI-NEXT:    s_max_i32 s13, s5, s7
+; VI-NEXT:    s_lshl_b32 s12, s12, 16
+; VI-NEXT:    s_and_b32 s13, s13, 0xffff
+; VI-NEXT:    s_min_i32 s4, s4, s6
+; VI-NEXT:    s_min_i32 s5, s5, s7
+; VI-NEXT:    s_or_b32 s12, s13, s12
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s12
+; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: s_min_max_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; CI-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0xd
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s2, s10
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s8, s4
+; CI-NEXT:    s_mov_b32 s0, s6
+; CI-NEXT:    s_ashr_i32 s4, s12, 16
+; CI-NEXT:    s_ashr_i32 s6, s13, 16
+; CI-NEXT:    s_mov_b32 s9, s5
+; CI-NEXT:    s_mov_b32 s1, s7
+; CI-NEXT:    s_sext_i32_i16 s5, s12
+; CI-NEXT:    s_sext_i32_i16 s7, s13
+; CI-NEXT:    s_max_i32 s12, s4, s6
+; CI-NEXT:    s_max_i32 s13, s5, s7
+; CI-NEXT:    v_mov_b32_e32 v0, s12
+; CI-NEXT:    s_min_i32 s4, s4, s6
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s13
+; CI-NEXT:    s_mov_b32 s3, s11
+; CI-NEXT:    s_min_i32 s5, s5, s7
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s5
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
   %cond0 = icmp sgt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
@@ -173,10 +647,110 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
   ret void
 }
 
-; GCN-LABEL: {{^}}v_min_max_v2i16:
-; GFX9: v_pk_max_i16
-; GFX9: v_pk_min_i16
 define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
+; GFX9-LABEL: v_min_max_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s14, s2
+; GFX9-NEXT:    s_mov_b32 s15, s3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s12, s8
+; GFX9-NEXT:    s_mov_b32 s13, s9
+; GFX9-NEXT:    s_mov_b32 s8, s10
+; GFX9-NEXT:    s_mov_b32 s9, s11
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s8, s6
+; GFX9-NEXT:    s_mov_b32 s9, s7
+; GFX9-NEXT:    v_pk_max_i16 v2, v0, v1
+; GFX9-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: v_min_max_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s14, s10
+; VI-NEXT:    s_mov_b32 s15, s11
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s4
+; VI-NEXT:    s_mov_b32 s13, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s10
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
+; VI-NEXT:    s_mov_b32 s4, s2
+; VI-NEXT:    s_mov_b32 s5, s3
+; VI-NEXT:    v_max_i32_sdwa v2, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_max_i32_sdwa v3, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_i32_sdwa v4, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: v_min_max_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s14, s10
+; CI-NEXT:    s_mov_b32 s15, s11
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s12, s4
+; CI-NEXT:    s_mov_b32 s13, s5
+; CI-NEXT:    s_mov_b32 s4, s6
+; CI-NEXT:    s_mov_b32 s5, s7
+; CI-NEXT:    s_mov_b32 s6, s10
+; CI-NEXT:    s_mov_b32 s7, s11
+; CI-NEXT:    buffer_load_sshort v0, off, s[12:15], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v1, off, s[12:15], 0 offset:2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v2, off, s[4:7], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v3, off, s[4:7], 0 offset:2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s8, s0
+; CI-NEXT:    s_mov_b32 s9, s1
+; CI-NEXT:    s_mov_b32 s4, s2
+; CI-NEXT:    s_mov_b32 s5, s3
+; CI-NEXT:    v_max_i32_e32 v4, v0, v2
+; CI-NEXT:    v_max_i32_e32 v5, v1, v3
+; CI-NEXT:    v_min_i32_e32 v0, v0, v2
+; CI-NEXT:    v_min_i32_e32 v1, v1, v3
+; CI-NEXT:    buffer_store_short v5, off, s[8:11], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v4, off, s[8:11], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
   %val0 = load volatile <2 x i16>, ptr addrspace(1) %ptr0
   %val1 = load volatile <2 x i16>, ptr addrspace(1) %ptr1
 
@@ -189,12 +763,134 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
   ret void
 }
 
-; GCN-LABEL: {{^}}s_min_max_v4i16:
-; GFX9-DAG: v_pk_max_i16
-; GFX9-DAG: v_pk_min_i16
-; GFX9-DAG: v_pk_max_i16
-; GFX9-DAG: v_pk_min_i16
 define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+; GFX9-LABEL: s_min_max_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s11
+; GFX9-NEXT:    v_mov_b32_e32 v4, s10
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    v_pk_max_i16 v1, s9, v2
+; GFX9-NEXT:    v_pk_max_i16 v0, s8, v4
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    v_pk_min_i16 v3, s9, v2
+; GFX9-NEXT:    v_pk_min_i16 v2, s8, v4
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: s_min_max_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_ashr_i32 s12, s9, 16
+; VI-NEXT:    s_ashr_i32 s13, s8, 16
+; VI-NEXT:    s_sext_i32_i16 s9, s9
+; VI-NEXT:    s_sext_i32_i16 s8, s8
+; VI-NEXT:    s_ashr_i32 s14, s11, 16
+; VI-NEXT:    s_ashr_i32 s15, s10, 16
+; VI-NEXT:    s_sext_i32_i16 s11, s11
+; VI-NEXT:    s_sext_i32_i16 s10, s10
+; VI-NEXT:    s_max_i32 s16, s13, s15
+; VI-NEXT:    s_max_i32 s17, s12, s14
+; VI-NEXT:    s_max_i32 s18, s8, s10
+; VI-NEXT:    s_max_i32 s19, s9, s11
+; VI-NEXT:    s_min_i32 s12, s12, s14
+; VI-NEXT:    s_min_i32 s9, s9, s11
+; VI-NEXT:    s_lshl_b32 s17, s17, 16
+; VI-NEXT:    s_and_b32 s19, s19, 0xffff
+; VI-NEXT:    s_lshl_b32 s16, s16, 16
+; VI-NEXT:    s_and_b32 s18, s18, 0xffff
+; VI-NEXT:    s_min_i32 s13, s13, s15
+; VI-NEXT:    s_min_i32 s8, s8, s10
+; VI-NEXT:    s_lshl_b32 s10, s12, 16
+; VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; VI-NEXT:    s_or_b32 s17, s19, s17
+; VI-NEXT:    s_or_b32 s16, s18, s16
+; VI-NEXT:    s_or_b32 s9, s9, s10
+; VI-NEXT:    s_lshl_b32 s10, s13, 16
+; VI-NEXT:    s_and_b32 s8, s8, 0xffff
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s16
+; VI-NEXT:    v_mov_b32_e32 v1, s17
+; VI-NEXT:    s_or_b32 s8, s8, s10
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    v_mov_b32_e32 v2, s8
+; VI-NEXT:    v_mov_b32_e32 v3, s9
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: s_min_max_v4i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_ashr_i32 s12, s5, 16
+; CI-NEXT:    s_ashr_i32 s14, s7, 16
+; CI-NEXT:    s_sext_i32_i16 s5, s5
+; CI-NEXT:    s_sext_i32_i16 s7, s7
+; CI-NEXT:    s_max_i32 s17, s12, s14
+; CI-NEXT:    s_mov_b32 s8, s0
+; CI-NEXT:    s_mov_b32 s9, s1
+; CI-NEXT:    s_ashr_i32 s13, s4, 16
+; CI-NEXT:    s_ashr_i32 s15, s6, 16
+; CI-NEXT:    s_max_i32 s19, s5, s7
+; CI-NEXT:    v_mov_b32_e32 v0, s17
+; CI-NEXT:    s_sext_i32_i16 s4, s4
+; CI-NEXT:    s_sext_i32_i16 s6, s6
+; CI-NEXT:    s_max_i32 s16, s13, s15
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:6
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s19
+; CI-NEXT:    s_max_i32 s18, s4, s6
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s16
+; CI-NEXT:    s_min_i32 s12, s12, s14
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s18
+; CI-NEXT:    s_mov_b32 s0, s2
+; CI-NEXT:    s_mov_b32 s1, s3
+; CI-NEXT:    s_mov_b32 s2, s10
+; CI-NEXT:    s_mov_b32 s3, s11
+; CI-NEXT:    s_min_i32 s5, s5, s7
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s12
+; CI-NEXT:    s_min_i32 s13, s13, s15
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:6
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s5
+; CI-NEXT:    s_min_i32 s4, s4, s6
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s13
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
   %cond0 = icmp sgt <4 x i16> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0
@@ -204,8 +900,161 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
   ret void
 }
 
-; GCN-LABEL: {{^}}v_min_max_v2i16_user:
 define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) #0 {
+; GFX9-LABEL: v_min_max_v2i16_user:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s2, s6
+; GFX9-NEXT:    s_mov_b32 s3, s7
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s12
+; GFX9-NEXT:    s_mov_b32 s1, s13
+; GFX9-NEXT:    s_mov_b32 s12, s14
+; GFX9-NEXT:    s_mov_b32 s13, s15
+; GFX9-NEXT:    s_mov_b32 s14, s6
+; GFX9-NEXT:    s_mov_b32 s15, s7
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s8
+; GFX9-NEXT:    s_mov_b32 s5, s9
+; GFX9-NEXT:    s_mov_b32 s12, s10
+; GFX9-NEXT:    s_mov_b32 s13, s11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT:    v_cmp_gt_i16_e32 vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
+; GFX9-NEXT:    v_cmp_gt_i16_e64 s[0:1], v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v3, v2, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 1, v3
+; GFX9-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[12:15], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v0, v1, v2
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: v_min_max_v2i16_user:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s12
+; VI-NEXT:    s_mov_b32 s5, s13
+; VI-NEXT:    s_mov_b32 s12, s14
+; VI-NEXT:    s_mov_b32 s13, s15
+; VI-NEXT:    s_mov_b32 s14, s2
+; VI-NEXT:    s_mov_b32 s15, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s8
+; VI-NEXT:    s_mov_b32 s1, s9
+; VI-NEXT:    s_mov_b32 s5, s11
+; VI-NEXT:    s_mov_b32 s4, s10
+; VI-NEXT:    v_readfirstlane_b32 s8, v0
+; VI-NEXT:    v_readfirstlane_b32 s9, v1
+; VI-NEXT:    s_ashr_i32 s11, s8, 16
+; VI-NEXT:    s_ashr_i32 s13, s9, 16
+; VI-NEXT:    s_cmp_gt_i32 s11, s13
+; VI-NEXT:    s_sext_i32_i16 s10, s8
+; VI-NEXT:    s_sext_i32_i16 s12, s9
+; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
+; VI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; VI-NEXT:    s_cselect_b32 s8, s11, s13
+; VI-NEXT:    s_cselect_b32 s11, s13, s11
+; VI-NEXT:    s_lshl_b32 s13, s8, 16
+; VI-NEXT:    s_cmp_gt_i32 s10, s12
+; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; VI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; VI-NEXT:    s_cselect_b32 s8, s10, s12
+; VI-NEXT:    s_cselect_b32 s9, s12, s10
+; VI-NEXT:    s_and_b32 s8, s8, 0xffff
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
+; VI-NEXT:    s_lshl_b32 s10, s11, 16
+; VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; VI-NEXT:    s_or_b32 s8, s8, s13
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    s_or_b32 s9, s9, s10
+; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    v_and_b32_e32 v0, 3, v0
+; VI-NEXT:    v_mov_b32_e32 v2, s9
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: v_min_max_v2i16_user:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s2, s6
+; CI-NEXT:    s_mov_b32 s3, s7
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s0, s12
+; CI-NEXT:    s_mov_b32 s1, s13
+; CI-NEXT:    s_mov_b32 s12, s14
+; CI-NEXT:    s_mov_b32 s13, s15
+; CI-NEXT:    s_mov_b32 s14, s6
+; CI-NEXT:    s_mov_b32 s15, s7
+; CI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v1, off, s[0:3], 0 offset:2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v2, off, s[12:15], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_sshort v3, off, s[12:15], 0 offset:2 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s8
+; CI-NEXT:    s_mov_b32 s5, s9
+; CI-NEXT:    s_mov_b32 s12, s10
+; CI-NEXT:    s_mov_b32 s13, s11
+; CI-NEXT:    v_cmp_gt_i32_e32 vcc, v0, v2
+; CI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v1, v3
+; CI-NEXT:    v_cndmask_b32_e32 v4, v2, v0, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; CI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; CI-NEXT:    buffer_store_short v5, off, s[4:7], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v4, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v1, off, s[12:15], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v0, off, s[12:15], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v2
+; CI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
+; CI-NEXT:    v_and_b32_e32 v0, 3, v0
+; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
   %val0 = load volatile <2 x i16>, ptr addrspace(1) %ptr0
   %val1 = load volatile <2 x i16>, ptr addrspace(1) %ptr1
 
@@ -219,10 +1068,96 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
   ret void
 }
 
-; GCN-LABEL: {{^}}u_min_max_v2i16:
-; GFX9: v_pk_max_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+; GFX9-LABEL: u_min_max_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x34
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s2, s10
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s13
+; GFX9-NEXT:    s_mov_b32 s9, s5
+; GFX9-NEXT:    v_pk_max_u16 v1, s12, v0
+; GFX9-NEXT:    s_mov_b32 s0, s6
+; GFX9-NEXT:    s_mov_b32 s1, s7
+; GFX9-NEXT:    s_mov_b32 s3, s11
+; GFX9-NEXT:    v_pk_min_u16 v0, s12, v0
+; GFX9-NEXT:    buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; VI-LABEL: u_min_max_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x34
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s2, s10
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s4
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_lshr_b32 s4, s12, 16
+; VI-NEXT:    s_lshr_b32 s6, s13, 16
+; VI-NEXT:    s_mov_b32 s9, s5
+; VI-NEXT:    s_mov_b32 s1, s7
+; VI-NEXT:    s_and_b32 s5, s12, 0xffff
+; VI-NEXT:    s_and_b32 s7, s13, 0xffff
+; VI-NEXT:    s_max_u32 s13, s4, s6
+; VI-NEXT:    s_max_u32 s12, s5, s7
+; VI-NEXT:    s_lshl_b32 s13, s13, 16
+; VI-NEXT:    s_min_u32 s4, s4, s6
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_min_u32 s5, s5, s7
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_or_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s12
+; VI-NEXT:    s_mov_b32 s3, s11
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: u_min_max_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
+; CI-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0xd
+; CI-NEXT:    s_mov_b32 s11, 0xf000
+; CI-NEXT:    s_mov_b32 s10, -1
+; CI-NEXT:    s_mov_b32 s2, s10
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s8, s4
+; CI-NEXT:    s_mov_b32 s0, s6
+; CI-NEXT:    s_lshr_b32 s4, s12, 16
+; CI-NEXT:    s_lshr_b32 s6, s13, 16
+; CI-NEXT:    s_mov_b32 s9, s5
+; CI-NEXT:    s_mov_b32 s1, s7
+; CI-NEXT:    s_and_b32 s5, s12, 0xffff
+; CI-NEXT:    s_and_b32 s7, s13, 0xffff
+; CI-NEXT:    s_max_u32 s13, s4, s6
+; CI-NEXT:    s_max_u32 s12, s5, s7
+; CI-NEXT:    v_mov_b32_e32 v0, s13
+; CI-NEXT:    s_min_u32 s4, s4, s6
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s12
+; CI-NEXT:    s_mov_b32 s3, s11
+; CI-NEXT:    s_min_u32 s5, s5, s7
+; CI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:2
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s5
+; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
   %cond0 = icmp ugt <2 x i16> %val0, %val1
   %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
   %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
@@ -236,3 +1171,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CIVI: {{.*}}
+; GCN: {{.*}}

>From 3fa6da14373eaa45f9487409c9732dbaf8c20397 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Wed, 28 Aug 2024 14:18:57 +0200
Subject: [PATCH 2/2] [AMDGPU] Promote uniform ops to I32 in ISel

Promote uniform binops, selects and setcc in Global & DAGISel instead of CGP.

Solves #64591
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |    2 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   19 +-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   10 +-
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |    8 +-
 llvm/lib/Target/AMDGPU/AMDGPUCombine.td       |   28 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   35 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |    2 +-
 .../Target/AMDGPU/AMDGPURegBankCombiner.cpp   |  113 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  156 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    2 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |    3 +-
 llvm/lib/Target/X86/X86ISelLowering.h         |    2 +-
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |   70 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  |  114 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |  163 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |  120 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |  130 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |  854 ++-
 llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll   |  149 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll    |   77 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   |  114 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |  169 +-
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |   10 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |   88 +-
 .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll    |   54 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll   |   26 +-
 llvm/test/CodeGen/AMDGPU/add.v2i16.ll         |   22 +-
 ...amdgpu-codegenprepare-fold-binop-select.ll |    7 +-
 .../amdgpu-codegenprepare-i16-to-i32.ll       |    4 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  652 +-
 .../CodeGen/AMDGPU/amdgpu.private-memory.ll   |    2 +-
 llvm/test/CodeGen/AMDGPU/anyext.ll            |    8 +-
 llvm/test/CodeGen/AMDGPU/bitreverse.ll        |    7 +-
 .../branch-folding-implicit-def-subreg.ll     |    4 +-
 .../AMDGPU/bug-sdag-emitcopyfromreg.ll        |   64 +-
 .../CodeGen/AMDGPU/calling-conventions.ll     | 1739 ++---
 .../CodeGen/AMDGPU/cgp-bitfield-extract.ll    |   11 +-
 llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |   26 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |   25 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |   14 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |   38 +-
 llvm/test/CodeGen/AMDGPU/dagcombine-select.ll |    5 +-
 .../CodeGen/AMDGPU/extract_vector_dynelt.ll   |  452 +-
 .../CodeGen/AMDGPU/extract_vector_elt-i8.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  101 +-
 llvm/test/CodeGen/AMDGPU/fneg.ll              |   13 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll         |  532 --
 .../AMDGPU/gfx-callable-argument-types.ll     |   33 +-
 llvm/test/CodeGen/AMDGPU/idiv-licm.ll         |  463 +-
 llvm/test/CodeGen/AMDGPU/imm16.ll             |   18 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  106 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    | 1903 +++--
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |  160 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll    |   40 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  | 6643 ++++++++---------
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 4835 ++++++------
 llvm/test/CodeGen/AMDGPU/load-global-i8.ll    |   15 +-
 llvm/test/CodeGen/AMDGPU/load-local-i8.ll     |   15 +-
 .../AMDGPU/lower-lds-struct-aa-memcpy.ll      |    4 +-
 llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll        |   14 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |  395 +-
 llvm/test/CodeGen/AMDGPU/mul.ll               |   51 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll        |   73 +-
 llvm/test/CodeGen/AMDGPU/preload-kernargs.ll  |  227 +-
 llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll  |   35 +-
 llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll     |  199 +-
 llvm/test/CodeGen/AMDGPU/select-i1.ll         |   13 +-
 llvm/test/CodeGen/AMDGPU/select-vectors.ll    |    5 +-
 llvm/test/CodeGen/AMDGPU/setcc-opt.ll         |   17 +-
 llvm/test/CodeGen/AMDGPU/sext-in-reg.ll       |   14 +-
 llvm/test/CodeGen/AMDGPU/shl.ll               |    5 +-
 llvm/test/CodeGen/AMDGPU/shl.v2i16.ll         |    6 +-
 llvm/test/CodeGen/AMDGPU/sign_extend.ll       |   19 +-
 llvm/test/CodeGen/AMDGPU/smed3.ll             |   20 +-
 llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll     |  284 +-
 llvm/test/CodeGen/AMDGPU/sra.ll               |   80 +-
 llvm/test/CodeGen/AMDGPU/srem.ll              |   36 +-
 llvm/test/CodeGen/AMDGPU/sub.v2i16.ll         |   34 +-
 llvm/test/CodeGen/AMDGPU/trunc-combine.ll     |    4 +-
 llvm/test/CodeGen/AMDGPU/trunc-store.ll       |  136 +-
 llvm/test/CodeGen/AMDGPU/uaddo.ll             |   15 +-
 llvm/test/CodeGen/AMDGPU/usubo.ll             |   15 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   21 +-
 .../CodeGen/AMDGPU/vector-alloca-bitcast.ll   |    3 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |   61 +-
 llvm/test/CodeGen/AMDGPU/zero_extend.ll       |   11 +-
 88 files changed, 10580 insertions(+), 11708 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index eda38cd8a564d6..85310a4911b8ed 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3299,7 +3299,7 @@ class TargetLoweringBase {
   /// Return true if it's profitable to narrow operations of type SrcVT to
   /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
   /// i32 to i16.
-  virtual bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+  virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0a906743f29ff..513ad392cb360a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7031,7 +7031,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
         TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
         TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
-        TLI.isNarrowingProfitable(VT, SrcVT))
+        TLI.isNarrowingProfitable(N, VT, SrcVT))
       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
                          DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
                                      DAG.getZExtOrTrunc(N1, DL, SrcVT)));
@@ -14574,7 +14574,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
   // ShLeftAmt will indicate how much a narrowed load should be shifted left.
   unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
-      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+      ExtVT == VT && TLI.isNarrowingProfitable(N, N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       ShLeftAmt = N01->getZExtValue();
       N0 = N0.getOperand(0);
@@ -15118,9 +15118,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   }
 
   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
-  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
-    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
-        TLI.isTruncateFree(SrcVT, VT)) {
+  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse() &&
+      TLI.isTruncateFree(SrcVT, VT)) {
+    if (!LegalOperations ||
+        (TLI.isOperationLegal(ISD::SELECT, SrcVT) &&
+         TLI.isNarrowingProfitable(N0.getNode(), N0.getValueType(), VT))) {
       SDLoc SL(N0);
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
@@ -20061,10 +20063,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth &&
-           (NewVT.getStoreSizeInBits() != NewBW ||
-            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-            !TLI.isNarrowingProfitable(VT, NewVT))) {
+    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
+                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 4e796289cff0a1..97e10b3551db1a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1841,7 +1841,7 @@ bool TargetLowering::SimplifyDemandedBits(
         for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
              SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
           EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
-          if (isNarrowingProfitable(VT, SmallVT) &&
+          if (isNarrowingProfitable(Op.getNode(), VT, SmallVT) &&
               isTypeDesirableForOp(ISD::SHL, SmallVT) &&
               isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
               (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
@@ -1865,7 +1865,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < HalfWidth &&
           DemandedBits.countLeadingOnes() >= HalfWidth) {
         EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), HalfWidth);
-        if (isNarrowingProfitable(VT, HalfVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
             isTypeDesirableForOp(ISD::SHL, HalfVT) &&
             isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, HalfVT))) {
@@ -1984,7 +1984,7 @@ bool TargetLowering::SimplifyDemandedBits(
       if ((BitWidth % 2) == 0 && !VT.isVector()) {
         APInt HiBits = APInt::getHighBitsSet(BitWidth, BitWidth / 2);
         EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
-        if (isNarrowingProfitable(VT, HalfVT) &&
+        if (isNarrowingProfitable(Op.getNode(), VT, HalfVT) &&
             isTypeDesirableForOp(ISD::SRL, HalfVT) &&
             isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
             (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, HalfVT)) &&
@@ -4762,9 +4762,11 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       case ISD::SETULT:
       case ISD::SETULE: {
         EVT newVT = N0.getOperand(0).getValueType();
+        // FIXME: Should use isNarrowingProfitable.
         if (DCI.isBeforeLegalizeOps() ||
             (isOperationLegal(ISD::SETCC, newVT) &&
-             isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+             isCondCodeLegal(Cond, newVT.getSimpleVT()) &&
+             isTypeDesirableForOp(ISD::SETCC, newVT))) {
           EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
           SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 052e1140533f3f..f689fcf62fe8eb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -46,10 +46,10 @@ static cl::opt<bool> WidenLoads(
   cl::init(false));
 
 static cl::opt<bool> Widen16BitOps(
-  "amdgpu-codegenprepare-widen-16-bit-ops",
-  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
-  cl::ReallyHidden,
-  cl::init(true));
+    "amdgpu-codegenprepare-widen-16-bit-ops",
+    cl::desc(
+        "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
+    cl::ReallyHidden, cl::init(false));
 
 static cl::opt<bool>
     BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index b2a3f9392157d1..01e96159babd03 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -145,6 +145,31 @@ def expand_promoted_fmed3 : GICombineRule<
 
 } // End Predicates = [NotHasMed3_16]
 
+def promote_i16_uniform_binops_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ADD, G_SUB, G_SHL, G_ASHR, G_LSHR, G_AND, G_XOR, G_OR, G_MUL],
+          (pattern (op i16:$dst, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_binops : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_binops_frag i16:$dst):$mi,
+    [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+def promote_i16_uniform_ternary_frag : GICombinePatFrag<
+  (outs root:$dst), (ins),
+  !foreach(op, [G_ICMP, G_SELECT],
+          (pattern (op i16:$dst, $first, i16:$lhs, i16:$rhs)))>;
+
+def promote_i16_uniform_ternary : GICombineRule<
+  (defs root:$dst),
+  (match (promote_i16_uniform_ternary_frag i16:$dst):$mi,
+    [{ return matchPromote16to32(*${mi}); }]),
+  (apply [{ applyPromote16to32(*${mi}); }])
+>;
+
+
 // Combines which should only apply on SI/CI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
@@ -169,5 +194,6 @@ def AMDGPURegBankCombiner : GICombiner<
   "AMDGPURegBankCombinerImpl",
   [unmerge_merge, unmerge_cst, unmerge_undef,
    zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   promote_i16_uniform_binops, promote_i16_uniform_ternary]> {
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 96143d688801aa..1a596cc80c0c9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1017,14 +1017,45 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
   return Src == MVT::i32 && Dest == MVT::i64;
 }
 
-bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+                                                 EVT DestVT) const {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::MUL:
+  case ISD::SETCC:
+  case ISD::SELECT:
+    if (Subtarget->has16BitInsts() &&
+        (DestVT.isVector() ? !Subtarget->hasVOP3PInsts() : true)) {
+      // Don't narrow back down to i16 if promoted to i32 already.
+      if (!N->isDivergent() && DestVT.isInteger() &&
+          DestVT.getScalarSizeInBits() > 1 &&
+          DestVT.getScalarSizeInBits() <= 16 &&
+          SrcVT.getScalarSizeInBits() > 16) {
+        return false;
+      }
+    }
+    return true;
+  default:
+    break;
+  }
+
   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
   // limited number of native 64-bit operations. Shrinking an operation to fit
   // in a single 32-bit register should always be helpful. As currently used,
   // this is much less general than the name suggests, and is only used in
   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
   // not profitable, and may actually be harmful.
-  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+  if (isa<LoadSDNode>(N))
+    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
+
+  return true;
 }
 
 bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 59f640ea99de3e..4dfa7ac052a5ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -201,7 +201,7 @@ class AMDGPUTargetLowering : public TargetLowering {
                                NegatibleCost &Cost,
                                unsigned Depth) const override;
 
-  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
 
   bool isDesirableToCommuteWithShift(const SDNode *N,
                                      CombineLevel Level) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index e236a5d7522e02..3b4faa35b93738 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -89,6 +89,9 @@ class AMDGPURegBankCombinerImpl : public Combiner {
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
   void applyClamp(MachineInstr &MI, Register &Reg) const;
 
+  bool matchPromote16to32(MachineInstr &MI) const;
+  void applyPromote16to32(MachineInstr &MI) const;
+
 private:
   SIModeRegisterDefaults getMode() const;
   bool getIEEE() const;
@@ -348,6 +351,116 @@ bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
   return false;
 }
 
+bool AMDGPURegBankCombinerImpl::matchPromote16to32(MachineInstr &MI) const {
+  Register Dst = MI.getOperand(0).getReg();
+  LLT DstTy = MRI.getType(Dst);
+  const auto *RB = MRI.getRegBankOrNull(Dst);
+
+  // Only promote uniform instructions.
+  if (!RB || RB->getID() != AMDGPU::SGPRRegBankID)
+    return false;
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  return STI.has16BitInsts() &&
+         (DstTy.isVector() ? !STI.hasVOP3PInsts() : true);
+}
+
+static unsigned getExtOpcodeForPromotedOp(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_ASHR:
+    return AMDGPU::G_SEXT;
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SUB:
+  case AMDGPU::G_FSHR:
+    return AMDGPU::G_ZEXT;
+  case AMDGPU::G_AND:
+  case AMDGPU::G_OR:
+  case AMDGPU::G_XOR:
+  case AMDGPU::G_SHL:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return AMDGPU::G_ANYEXT;
+  case AMDGPU::G_ICMP: {
+    return CmpInst::isSigned(cast<GICmp>(MI).getCond()) ? AMDGPU::G_SEXT
+                                                        : AMDGPU::G_ZEXT;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+void AMDGPURegBankCombinerImpl::applyPromote16to32(MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  assert(Opc == AMDGPU::G_ADD || Opc == AMDGPU::G_SUB || Opc == AMDGPU::G_SHL ||
+         Opc == AMDGPU::G_LSHR || Opc == AMDGPU::G_ASHR ||
+         Opc == AMDGPU::G_AND || Opc == AMDGPU::G_OR || Opc == AMDGPU::G_XOR ||
+         Opc == AMDGPU::G_MUL || Opc == AMDGPU::G_SELECT ||
+         Opc == AMDGPU::G_ICMP);
+
+  Register Dst = MI.getOperand(0).getReg();
+
+  bool IsSelectOrCmp = (Opc == AMDGPU::G_SELECT || Opc == AMDGPU::G_ICMP);
+  Register LHS = MI.getOperand(IsSelectOrCmp + 1).getReg();
+  Register RHS = MI.getOperand(IsSelectOrCmp + 2).getReg();
+
+  assert(MRI.getType(Dst) == LLT::scalar(16));
+  assert(MRI.getType(LHS) == LLT::scalar(16));
+  assert(MRI.getType(RHS) == LLT::scalar(16));
+
+  assert(MRI.getRegBankOrNull(Dst)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(LHS)->getID() == AMDGPU::SGPRRegBankID);
+  assert(MRI.getRegBankOrNull(RHS)->getID() == AMDGPU::SGPRRegBankID);
+  const RegisterBank &RB = *MRI.getRegBankOrNull(Dst);
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInstrAndDebugLoc(MI);
+  const unsigned ExtOpc = getExtOpcodeForPromotedOp(MI);
+  LHS = B.buildInstr(ExtOpc, {S32}, {LHS}).getReg(0);
+  RHS = B.buildInstr(ExtOpc, {S32}, {RHS}).getReg(0);
+
+  MRI.setRegBank(LHS, RB);
+  MRI.setRegBank(RHS, RB);
+
+  MachineInstr *NewInst;
+  if (IsSelectOrCmp)
+    NewInst = B.buildInstr(Opc, {Dst}, {MI.getOperand(1), LHS, RHS});
+  else
+    NewInst = B.buildInstr(Opc, {S32}, {LHS, RHS});
+
+  if (Opc != AMDGPU::G_ICMP) {
+    Register Dst32 = NewInst->getOperand(0).getReg();
+    MRI.setRegBank(Dst32, RB);
+    B.buildTrunc(Dst, Dst32);
+  }
+
+  switch (Opc) {
+  case AMDGPU::G_ADD:
+  case AMDGPU::G_SHL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_SUB:
+    if (MI.getFlag(MachineInstr::NoUWrap))
+      NewInst->setFlag(MachineInstr::NoUWrap);
+    NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  case AMDGPU::G_MUL:
+    NewInst->setFlag(MachineInstr::NoUWrap);
+    if (MI.getFlag(MachineInstr::NoSWrap))
+      NewInst->setFlag(MachineInstr::NoSWrap);
+    break;
+  }
+
+  MI.eraseFromParent();
+}
+
 void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
                                            Register &Reg) const {
   B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1437f3d58b5e79..96a59acd751a62 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -894,6 +894,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::UADDO_CARRY,
                        ISD::SUB,
                        ISD::USUBO_CARRY,
+                       ISD::MUL,
                        ISD::FADD,
                        ISD::FSUB,
                        ISD::FDIV,
@@ -909,9 +910,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::UMIN,
                        ISD::UMAX,
                        ISD::SETCC,
+                       ISD::SELECT,
+                       ISD::SMIN,
+                       ISD::SMAX,
+                       ISD::UMIN,
+                       ISD::UMAX,
                        ISD::AND,
                        ISD::OR,
                        ISD::XOR,
+                       ISD::SHL,
+                       ISD::SRL,
+                       ISD::SRA,
                        ISD::FSHR,
                        ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP,
@@ -1935,13 +1944,6 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
     switch (Op) {
     case ISD::LOAD:
     case ISD::STORE:
-
-    // These operations are done with 32-bit instructions anyway.
-    case ISD::AND:
-    case ISD::OR:
-    case ISD::XOR:
-    case ISD::SELECT:
-      // TODO: Extensions?
       return true;
     default:
       return false;
@@ -6746,6 +6748,122 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
 }
 
+static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
+  switch (Op->getOpcode()) {
+  case ISD::SRA:
+  case ISD::SMIN:
+  case ISD::SMAX:
+    return ISD::SIGN_EXTEND;
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SRL:
+  case ISD::UMIN:
+  case ISD::UMAX:
+    return ISD::ZERO_EXTEND;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SELECT:
+  case ISD::MUL:
+    // operation result won't be influenced by garbage high bits.
+    // TODO: are all of those cases correct, and are there more?
+    return ISD::ANY_EXTEND;
+  case ISD::SETCC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  }
+  default:
+    llvm_unreachable("unexpected opcode!");
+  }
+}
+
+SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
+                                                DAGCombinerInfo &DCI) const {
+  const unsigned Opc = Op.getOpcode();
+  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
+         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
+         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
+         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
+         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
+
+  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
+                                 : Op->getOperand(0).getValueType();
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // Promote only if:
+  //    - We have 16 bit insts (not true 16 bit insts).
+  //    - We don't have packed instructions (for vector types only).
+  // TODO: For vector types, the set of packed operations is more limited, so
+  // may want to promote some anyway.
+  if (!Subtarget->has16BitInsts() ||
+      (OpTy.isVector() ? Subtarget->hasVOP3PInsts() : false))
+    return SDValue();
+
+  // Promote uniform scalar and vector integers between 2 and 16 bits.
+  if (Op->isDivergent() || !OpTy.isInteger() ||
+      OpTy.getScalarSizeInBits() == 1 || OpTy.getScalarSizeInBits() > 16)
+    return SDValue();
+
+  auto &DAG = DCI.DAG;
+
+  SDLoc DL(Op);
+  SDValue LHS;
+  SDValue RHS;
+  if (Opc == ISD::SELECT) {
+    LHS = Op->getOperand(1);
+    RHS = Op->getOperand(2);
+  } else {
+    LHS = Op->getOperand(0);
+    RHS = Op->getOperand(1);
+  }
+
+  auto ExtTy = OpTy.changeElementType(MVT::i32);
+
+  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
+  LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
+  RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
+
+  // setcc always return i1/i1 vec so no need to truncate after.
+  if (Opc == ISD::SETCC) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
+  }
+
+  SDNodeFlags Flags;
+  switch (Op->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SHL:
+    Flags.setNoUnsignedWrap(true);
+    Flags.setNoSignedWrap(true);
+    break;
+  case ISD::SUB:
+    Flags.setNoUnsignedWrap(Op->getFlags().hasNoUnsignedWrap());
+    Flags.setNoSignedWrap(true);
+    break;
+  case ISD::MUL:
+    Flags.setNoUnsignedWrap(true);
+    Flags.setNoSignedWrap(Op->getFlags().hasNoSignedWrap());
+    break;
+  default:
+    break;
+  }
+
+  Flags.setExact(Op->getFlags().hasExact());
+
+  // For other ops, we extend the operation's return type as well so we need to
+  // truncate back to the original type.
+  SDValue NewVal;
+  if (Opc == ISD::SELECT)
+    NewVal = DAG.getSelect(DL, ExtTy, Op->getOperand(0), LHS, RHS);
+  else
+    NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS}, Flags);
+
+  return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
+}
+
 // Custom lowering for vector multiplications and s_mul_u64.
 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -14682,8 +14800,32 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::MUL:
+  case ISD::SETCC:
+  case ISD::SELECT:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
+    if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
+      return Res;
+    break;
+  default:
+    break;
+  }
+
   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
     return SDValue();
+
   switch (N->getOpcode()) {
   case ISD::ADD:
     return performAddCombine(N, DCI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index eed4b3e79cdeee..f299dabdbc6d0b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -148,6 +148,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
@@ -464,7 +465,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1ef..c187820bc896f7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34444,7 +34444,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   return false;
 }
 
-bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
+bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
+                                              EVT DestVT) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 93d2b3e65742b2..44bf7e98a2ded8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1429,7 +1429,7 @@ namespace llvm {
     /// Return true if it's profitable to narrow operations of type SrcVT to
     /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
     /// from i32 to i16.
-    bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
+    bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
 
     bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode,
                                               EVT VT) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index c8b82716a9fe13..9c25a07bc8dc3d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -281,12 +281,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_addk_i32 s1, 0xffc0
+; GFX8-NEXT:    s_addk_i32 s0, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_splat:
@@ -323,12 +323,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s1, s1, 4
+; GFX8-NEXT:    s_addk_i32 s0, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_lo:
@@ -365,12 +365,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_addk_i32 s1, 0xffc0
 ; GFX8-NEXT:    s_add_i32 s0, s0, 4
-; GFX8-NEXT:    s_add_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_neg_inline_imm_hi:
@@ -408,14 +408,13 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX8-LABEL: s_add_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16:
@@ -461,14 +460,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs:
@@ -517,14 +515,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_rhs:
@@ -580,14 +577,13 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_add_i32 s0, s0, s1
 ; GFX8-NEXT:    s_add_i32 s2, s2, s3
-; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_add_i32 s0, s0, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972e7..cfa93a0a301671 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -349,63 +349,67 @@ define amdgpu_ps <2 x i32> @s_andn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i3
 }
 
 define amdgpu_ps i16 @s_andn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_andn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_andn2_i16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s3, -1
+; GFX9-NEXT:    s_and_b32 s0, s2, s0
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_andn2_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s0
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %and = and i16 %src0, %not.src1
   ret i16 %and
 }
 
 define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16_commute:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_andn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_commute:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_andn2_i16_commute:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s3, -1
+; GFX9-NEXT:    s_and_b32 s0, s0, s2
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_andn2_i16_commute:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_commute:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, s2
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %and = and i16 %not.src1, %src0
   ret i16 %and
 }
 
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_andn2_i16_multi_use:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_xor_b32 s1, s3, -1
-; GCN-NEXT:    s_andn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_multi_use:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_xor_b32 s1, s3, -1
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_andn2_i16_multi_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_andn2_b32 s0, s2, s3
-; GFX10-NEXT:    s_xor_b32 s1, s3, -1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s1, s3, -1
+; GFX9-NEXT:    s_and_b32 s0, s2, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_andn2_i16_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_not1_b32 s0, s2, s3
-; GFX11-NEXT:    s_xor_b32 s1, s3, -1
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_multi_use:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s1, s3, -1
+; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s1
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %and = and i16 %src0, %not.src1
   %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0
@@ -414,23 +418,25 @@ define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg
 }
 
 define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GCN-LABEL: s_andn2_i16_multi_foldable_use:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_andn2_b32 s0, s2, s4
-; GCN-NEXT:    s_andn2_b32 s1, s3, s4
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_andn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_andn2_b32 s1, s3, s4
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_andn2_b32 s0, s2, s4
-; GFX10-NEXT:    s_andn2_b32 s1, s3, s4
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s1, s4, -1
+; GFX9-NEXT:    s_and_b32 s0, s2, s1
+; GFX9-NEXT:    s_and_b32 s1, s3, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_andn2_i16_multi_foldable_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_not1_b32 s0, s2, s4
-; GFX11-NEXT:    s_and_not1_b32 s1, s3, s4
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_andn2_i16_multi_foldable_use:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s1, s4, -1
+; GFX10PLUS-NEXT:    s_and_b32 s0, s2, s1
+; GFX10PLUS-NEXT:    s_and_b32 s1, s3, s1
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src2 = xor i16 %src2, -1
   %and0 = and i16 %src0, %not.src2
   %and1 = and i16 %src1, %not.src2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 63f5464371cc62..58ae28bc48f4aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -79,22 +79,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) {
 ;
 ; GFX8-LABEL: s_ashr_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_ashr_i8:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_sext_i32_i8 s1, s1
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s1, s1
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i8 %value, %amount
@@ -102,15 +110,30 @@ define amdgpu_ps i8 @s_ashr_i8(i8 inreg %value, i8 inreg %amount) {
 }
 
 define amdgpu_ps i8 @s_ashr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_ashr_i8_7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sext_i32_i8 s0, s0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 7
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_ashr_i8_7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
+; GFX6-NEXT:    s_ashr_i32 s0, s0, 7
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_ashr_i8_7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_ashr_i8_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_sext_i32_i8 s0, s0
+; GFX9-NEXT:    s_sext_i32_i16 s0, s0
+; GFX9-NEXT:    s_ashr_i32 s0, s0, 7
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i8 %value, 7
@@ -652,21 +675,21 @@ define amdgpu_ps i16 @s_ashr_i16(i16 inreg %value, i16 inreg %amount) {
 ; GFX8-LABEL: s_ashr_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_ashr_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_sext_i32_i16 s1, s1
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_ashr_i16:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s1, s1
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = ashr i16 %value, %amount
@@ -827,14 +850,16 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ;
 ; GFX8-LABEL: s_ashr_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s2, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s3, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s2, s2, s3
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s1
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    s_sext_i32_i16 s1, s2
+; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1029,23 +1054,27 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ;
 ; GFX8-LABEL: s_ashr_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s4, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s5, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s6, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s7, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s4, s4, s6
+; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX8-NEXT:    s_ashr_i32 s2, s5, s7
+; GFX8-NEXT:    s_sext_i32_i16 s2, s4
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s3
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s3, s4, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s0, s3
+; GFX8-NEXT:    s_sext_i32_i16 s3, s5
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1236,41 +1265,49 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ;
 ; GFX8-LABEL: s_ashr_v8i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_sext_i32_i16 s8, s0
-; GFX8-NEXT:    s_bfe_i32 s0, s0, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s9, s1
-; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s12, s4
-; GFX8-NEXT:    s_bfe_i32 s4, s4, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s13, s5
-; GFX8-NEXT:    s_bfe_i32 s5, s5, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s10, s2
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s14, s6
-; GFX8-NEXT:    s_bfe_i32 s6, s6, 0x100010
+; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
+; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, s4
-; GFX8-NEXT:    s_ashr_i32 s4, s9, s13
+; GFX8-NEXT:    s_sext_i32_i16 s4, s8
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_ashr_i32 s4, s4, s12
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, s5
-; GFX8-NEXT:    s_sext_i32_i16 s11, s3
-; GFX8-NEXT:    s_bfe_i32 s3, s3, 0x100010
-; GFX8-NEXT:    s_sext_i32_i16 s15, s7
-; GFX8-NEXT:    s_bfe_i32 s7, s7, 0x100010
-; GFX8-NEXT:    s_ashr_i32 s5, s10, s14
+; GFX8-NEXT:    s_sext_i32_i16 s5, s9
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_ashr_i32 s5, s5, s13
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, s6
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s8, s8, s12
-; GFX8-NEXT:    s_ashr_i32 s6, s11, s15
+; GFX8-NEXT:    s_sext_i32_i16 s6, s10
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_ashr_i32 s6, s6, s14
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, s7
+; GFX8-NEXT:    s_sext_i32_i16 s7, s11
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s4, s5, 0xffff
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX8-NEXT:    s_and_b32 s7, s8, 0xffff
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT:    s_ashr_i32 s7, s7, s15
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_and_b32 s4, s6, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s0, s7
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index afffebea451a0e..3723b781ef364c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -3277,7 +3277,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX8-LABEL: s_fshl_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s3, s2, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
@@ -3290,7 +3291,8 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX9-LABEL: s_fshl_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s3, s2, 15
-; GFX9-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, -1
+; GFX9-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
@@ -3302,27 +3304,29 @@ define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: s_fshl_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s3, s2, 15
-; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX10-NEXT:    s_xor_b32 s3, s2, -1
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
+; GFX10-NEXT:    s_and_b32 s3, s3, 15
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshl_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s3, s2, 15
-; GFX11-NEXT:    s_and_not1_b32 s2, 15, s2
+; GFX11-NEXT:    s_xor_b32 s3, s2, -1
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
+; GFX11-NEXT:    s_and_b32 s3, s3, 15
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s3
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s2
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -3661,7 +3665,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX8-LABEL: v_fshl_i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
@@ -3672,7 +3677,8 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX9-LABEL: v_fshl_i16_svs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
-; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_xor_b32 s1, s1, -1
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
@@ -3683,8 +3689,9 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX10-LABEL: v_fshl_i16_svs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshrrev_b16 v0, 1, v0
-; GFX10-NEXT:    s_andn2_b32 s2, 15, s1
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
 ; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    v_lshrrev_b16 v0, s2, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
@@ -3694,9 +3701,9 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX11-LABEL: v_fshl_i16_svs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_lshrrev_b16 v0, 1, v0
-; GFX11-NEXT:    s_and_not1_b32 s2, 15, s1
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
 ; GFX11-NEXT:    s_and_b32 s1, s1, 15
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    v_lshrrev_b16 v0, s2, v0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, s1
@@ -3724,7 +3731,8 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX8-LABEL: v_fshl_i16_vss:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
@@ -3736,7 +3744,8 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX9-LABEL: v_fshl_i16_vss:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
-; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_xor_b32 s1, s1, -1
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
@@ -3747,24 +3756,26 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: v_fshl_i16_vss:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s2, s1, 15
-; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
+; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
 ; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT:    v_lshlrev_b16 v0, s1, v0
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s2
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshl_i16_vss:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s2, s1, 15
-; GFX11-NEXT:    s_and_not1_b32 s1, 15, s1
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
+; GFX11-NEXT:    s_and_b32 s1, s1, 15
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT:    v_lshlrev_b16 v0, s1, v0
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3802,10 +3813,11 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: s_fshl_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
@@ -3815,7 +3827,8 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
+; GFX8-NEXT:    s_xor_b32 s2, s5, -1
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_lshr_b32 s3, s4, 1
@@ -4183,9 +4196,10 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ;
 ; GFX8-LABEL: v_fshl_v2i16_svs:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s4, s1, 15
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_and_b32 s4, s1, 15
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
@@ -4193,8 +4207,9 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s1, v1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
+; GFX8-NEXT:    s_xor_b32 s1, s3, -1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 1
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, s0
@@ -4280,18 +4295,20 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: v_fshl_v2i16_vss:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_and_b32 s4, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    s_xor_b32 s1, s3, -1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
@@ -4402,10 +4419,11 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ;
 ; GFX8-LABEL: s_fshl_v3i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
@@ -4415,7 +4433,8 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
+; GFX8-NEXT:    s_xor_b32 s4, s8, -1
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_lshr_b32 s6, s7, 1
@@ -4423,7 +4442,8 @@ define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s5, 15
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_xor_b32 s5, s5, -1
+; GFX8-NEXT:    s_and_b32 s5, s5, 15
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s4
@@ -4759,10 +4779,11 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ;
 ; GFX8-LABEL: s_fshl_v4i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX8-NEXT:    s_and_b32 s12, s4, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s12, 0xffff, s12
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
@@ -4772,17 +4793,19 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s4
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s10, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s10
+; GFX8-NEXT:    s_xor_b32 s4, s10, -1
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_lshr_b32 s6, s8, 1
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s6, s4
+; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
 ; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_and_b32 s4, s5, 15
+; GFX8-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 16
-; GFX8-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_and_b32 s5, s5, 15
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
@@ -4790,9 +4813,10 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s4
+; GFX8-NEXT:    s_xor_b32 s4, s11, -1
 ; GFX8-NEXT:    s_or_b32 s1, s1, s3
 ; GFX8-NEXT:    s_and_b32 s3, s11, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s11
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, 1
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 8538dcabca924b..625a66d70f9512 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -3021,7 +3021,8 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX8-LABEL: s_fshr_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s3, s2, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
@@ -3034,7 +3035,8 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX9-LABEL: s_fshr_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s3, s2, 15
-; GFX9-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX9-NEXT:    s_xor_b32 s2, s2, -1
+; GFX9-NEXT:    s_and_b32 s2, s2, 15
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
@@ -3046,27 +3048,29 @@ define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: s_fshr_i16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s3, s2, 15
-; GFX10-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX10-NEXT:    s_xor_b32 s3, s2, -1
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
+; GFX10-NEXT:    s_and_b32 s3, s3, 15
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshr_i16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s3, s2, 15
-; GFX11-NEXT:    s_and_not1_b32 s2, 15, s2
+; GFX11-NEXT:    s_xor_b32 s3, s2, -1
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
+; GFX11-NEXT:    s_and_b32 s3, s3, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s3
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s3
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -3403,7 +3407,8 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX8-LABEL: v_fshr_i16_svs:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
@@ -3414,7 +3419,8 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ; GFX9-LABEL: v_fshr_i16_svs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
-; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_xor_b32 s1, s1, -1
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
@@ -3424,22 +3430,24 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
 ;
 ; GFX10-LABEL: v_fshr_i16_svs:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_and_b32 s2, s1, 15
-; GFX10-NEXT:    s_andn2_b32 s1, 15, s1
-; GFX10-NEXT:    v_lshrrev_b16 v0, s2, v0
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
+; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
+; GFX10-NEXT:    v_lshrrev_b16 v0, s1, v0
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: v_fshr_i16_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s2, s1, 15
-; GFX11-NEXT:    s_and_not1_b32 s1, 15, s1
-; GFX11-NEXT:    v_lshrrev_b16 v0, s2, v0
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
+; GFX11-NEXT:    s_and_b32 s1, s1, 15
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
+; GFX11-NEXT:    v_lshrrev_b16 v0, s1, v0
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s2
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -3467,7 +3475,8 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX8-LABEL: v_fshr_i16_vss:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s2, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s1, v0
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -3479,7 +3488,8 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX9-LABEL: v_fshr_i16_vss:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s2, s1, 15
-; GFX9-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX9-NEXT:    s_xor_b32 s1, s1, -1
+; GFX9-NEXT:    s_and_b32 s1, s1, 15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, s1, v0
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -3491,11 +3501,12 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX10-LABEL: v_fshr_i16_vss:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    s_andn2_b32 s2, 15, s1
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
 ; GFX10-NEXT:    s_and_b32 s1, s1, 15
+; GFX10-NEXT:    s_and_b32 s2, s2, 15
 ; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10-NEXT:    v_or_b32_e32 v0, s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -3503,11 +3514,13 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
 ; GFX11-LABEL: v_fshr_i16_vss:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX11-NEXT:    s_and_not1_b32 s2, 15, s1
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
 ; GFX11-NEXT:    s_and_b32 s1, s1, 15
+; GFX11-NEXT:    s_and_b32 s2, s2, 15
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    v_lshlrev_b16 v0, s2, v0
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_or_b32_e32 v0, s0, v0
@@ -3567,12 +3580,13 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_or_b32 s0, s0, s5
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 15
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_xor_b32 s2, s2, -1
 ; GFX8-NEXT:    s_or_b32 s3, s3, s5
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_lshr_b32 s5, s2, 16
 ; GFX8-NEXT:    s_and_b32 s6, s2, 15
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s2
+; GFX8-NEXT:    s_xor_b32 s2, s2, -1
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
@@ -3582,8 +3596,9 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    s_and_b32 s1, s5, 15
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX8-NEXT:    s_xor_b32 s2, s5, -1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NEXT:    s_andn2_b32 s2, 15, s5
+; GFX8-NEXT:    s_and_b32 s2, s2, 15
 ; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
@@ -4017,17 +4032,19 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, 1
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
-; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8-NEXT:    s_xor_b32 s0, s1, -1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v0
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX8-NEXT:    s_and_b32 s2, s0, 15
-; GFX8-NEXT:    s_andn2_b32 s0, 15, s0
+; GFX8-NEXT:    s_xor_b32 s0, s0, -1
+; GFX8-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8-NEXT:    s_and_b32 s0, s0, 15
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, s0, v3
 ; GFX8-NEXT:    s_and_b32 s0, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, s0, v2
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s1, v0
@@ -4146,12 +4163,13 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    v_or_b32_e32 v1, s3, v1
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX8-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_and_b32 s4, s1, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s1
+; GFX8-NEXT:    s_xor_b32 s1, s1, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
@@ -4160,7 +4178,8 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    v_or_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    s_and_b32 s0, s3, 15
-; GFX8-NEXT:    s_andn2_b32 s1, 15, s3
+; GFX8-NEXT:    s_xor_b32 s1, s3, -1
+; GFX8-NEXT:    s_and_b32 s1, s1, 15
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, s0, v0
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, 1
@@ -4290,12 +4309,13 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_or_b32 s0, s0, s8
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s7, 15
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    s_xor_b32 s4, s4, -1
 ; GFX8-NEXT:    s_or_b32 s6, s6, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
@@ -4305,8 +4325,9 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 1
+; GFX8-NEXT:    s_xor_b32 s4, s8, -1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s7
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
@@ -4317,10 +4338,11 @@ define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX8-NEXT:    s_lshr_b32 s4, s4, 15
 ; GFX8-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_xor_b32 s4, s5, -1
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_and_b32 s5, s4, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
@@ -4726,12 +4748,13 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_or_b32 s0, s0, s8
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s7, 15
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    s_xor_b32 s4, s4, -1
 ; GFX8-NEXT:    s_or_b32 s6, s6, s8
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX8-NEXT:    s_and_b32 s9, s4, 15
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s4
+; GFX8-NEXT:    s_xor_b32 s4, s4, -1
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_and_b32 s9, 0xffff, s9
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 1
@@ -4741,8 +4764,9 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_and_b32 s2, s8, 15
 ; GFX8-NEXT:    s_lshl_b32 s7, s7, 1
+; GFX8-NEXT:    s_xor_b32 s4, s8, -1
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX8-NEXT:    s_andn2_b32 s4, 15, s8
+; GFX8-NEXT:    s_and_b32 s4, s4, 15
 ; GFX8-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s7
 ; GFX8-NEXT:    s_lshr_b32 s6, s6, 1
@@ -4761,12 +4785,13 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 1
 ; GFX8-NEXT:    s_lshr_b32 s6, s4, 15
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s6, s5, 16
 ; GFX8-NEXT:    s_and_b32 s7, s5, 15
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s5
+; GFX8-NEXT:    s_xor_b32 s5, s5, -1
+; GFX8-NEXT:    s_and_b32 s5, s5, 15
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
@@ -4776,8 +4801,9 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %
 ; GFX8-NEXT:    s_or_b32 s1, s1, s3
 ; GFX8-NEXT:    s_and_b32 s3, s6, 15
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 1
+; GFX8-NEXT:    s_xor_b32 s5, s6, -1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NEXT:    s_andn2_b32 s5, 15, s6
+; GFX8-NEXT:    s_and_b32 s5, s5, 15
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4caf83774bbba2..53f6c9543c3e3f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -291,16 +291,16 @@ define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double
 define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xa
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x13
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x1c
-; GFX7-NEXT:    s_load_dword s5, s[0:1], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
+; GFX7-NEXT:    s_load_dword s7, s[2:3], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_and_b32 s2, 1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
+; GFX7-NEXT:    s_and_b32 s2, 1, s7
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -311,19 +311,20 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ;
 ; GFX8-LABEL: test_div_fmas_f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x28
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x70
-; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x94
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
+; GFX8-NEXT:    s_load_dword s5, s[2:3], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    s_and_b32 s2, 1, s5
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_nop 3
+; GFX8-NEXT:    s_and_b32 s0, 1, s5
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -332,52 +333,52 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GFX10_W32-LABEL: test_div_fmas_f32:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x4
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x70
-; GFX10_W32-NEXT:    s_load_dword s7, s[0:1], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x70
+; GFX10_W32-NEXT:    s_load_dword s7, s[2:3], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x4
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x70
-; GFX10_W64-NEXT:    s_load_dword s7, s[0:1], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x70
+; GFX10_W64-NEXT:    s_load_dword s7, s[2:3], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x4
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s5, s[0:1], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s7, s[2:3], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s5, v0, v1
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
@@ -387,17 +388,17 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 ; GFX11_W64-LABEL: test_div_fmas_f32:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x4
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s5, s[0:1], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s7, s[2:3], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s4
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s5, v0, v1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s6
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
@@ -411,35 +412,36 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0x13
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x1c
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s2, 1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_nop 1
+; GFX7-NEXT:    s_nop 2
 ; GFX7-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x4c
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x70
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_and_b32 s2, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_nop 3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, 1.0, v0, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -448,48 +450,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x4c
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x4c
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x4c
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x4c
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x4c
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x4c
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
 ; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -498,16 +500,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_0:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x4c
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x4c
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s4, v0
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
 ; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -520,35 +522,36 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x2
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x4
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0x2
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x4
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0xd
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s2, 1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_nop 1
+; GFX7-NEXT:    s_nop 2
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x8
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x10
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x8
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x10
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_and_b32 s2, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_nop 3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -557,48 +560,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x10
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x10
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x8
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x34
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x10
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x10
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x8
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x34
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x10
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x10
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x8
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
 ; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -607,16 +610,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x34
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x10
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x10
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x8
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s4, 1.0, v0
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
 ; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -629,35 +632,36 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xa
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x13
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x25
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x25
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_and_b32 s2, 1, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_and_b32 s2, 1, s6
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_nop 1
+; GFX7-NEXT:    s_nop 2
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x28
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x94
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_and_b32 s2, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_nop 3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_and_b32 s0, 1, s4
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, 1.0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -666,48 +670,48 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x94
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x94
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W32-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
 ; GFX11_W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -716,16 +720,16 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 ; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_2:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x94
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x94
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s3
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, 1.0
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
 ; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -738,8 +742,8 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) {
 ; GFX7-LABEL: test_div_fmas_f64:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
-; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x8
+; GFX7-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
@@ -758,8 +762,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ;
 ; GFX8-LABEL: test_div_fmas_f64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
-; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x20
+; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
@@ -779,10 +783,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX10_W32-LABEL: test_div_fmas_f64:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x20
-; GFX10_W32-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s0, s[2:3], 0x20
+; GFX10_W32-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s2
+; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
@@ -796,10 +800,10 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX10_W64-LABEL: test_div_fmas_f64:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x20
-; GFX10_W64-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s0, s[2:3], 0x20
+; GFX10_W64-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s2
+; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
@@ -813,8 +817,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX11_W32-LABEL: test_div_fmas_f64:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b32 s8, s[0:1], 0x20
-; GFX11_W32-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s8, s[2:3], 0x20
+; GFX11_W32-NEXT:    s_load_b256 s[0:7], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_and_b32 s8, 1, s8
 ; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -830,8 +834,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 ; GFX11_W64-LABEL: test_div_fmas_f64:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b32 s8, s[0:1], 0x20
-; GFX11_W64-NEXT:    s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s8, s[2:3], 0x20
+; GFX11_W64-NEXT:    s_load_b256 s[0:7], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_and_b32 s8, 1, s8
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
@@ -853,8 +857,8 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d
 define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) {
 ; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x2
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_cmp_eq_u32 s7, 0
@@ -872,18 +876,19 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 ;
 ; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX8-NEXT:    s_and_b32 s2, 1, s2
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_and_b32 s0, 1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_nop 3
+; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_nop 2
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -892,42 +897,42 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 ; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x1
-; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x1
-; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x8
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
-; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x1
-; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x8
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[2:3], 0x8
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6
@@ -944,8 +949,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 ; GFX11_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x1
-; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[0:1], 0x8
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[2:3], 0x8
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    s_cmp_eq_u32 s7, 0
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s5
@@ -968,15 +973,15 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
 ; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xa
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x13
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x1c
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    s_mov_b64 vcc, 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -985,16 +990,17 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ;
 ; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x28
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x70
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
 ; GFX8-NEXT:    s_mov_b64 vcc, 0
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1003,46 +1009,46 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
@@ -1052,15 +1058,15 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 ; GFX11_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
@@ -1074,15 +1080,15 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace
 define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
 ; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xa
-; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x13
-; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x1c
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s4, s[2:3], 0xa
+; GFX7-NEXT:    s_load_dword s5, s[2:3], 0x13
+; GFX7-NEXT:    s_load_dword s6, s[2:3], 0x1c
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX7-NEXT:    s_mov_b64 vcc, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
@@ -1091,16 +1097,17 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ;
 ; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x28
-; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x70
+; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x28
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x4c
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x70
 ; GFX8-NEXT:    s_mov_b64 vcc, -1
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1109,46 +1116,46 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
 ; GFX10_W32-NEXT:    s_clause 0x3
-; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
-; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; GFX10_W32-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W32-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, -1
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W32-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W32-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
 ; GFX10_W64-NEXT:    s_clause 0x3
-; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
-; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x70
-; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x28
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; GFX10_W64-NEXT:    s_load_dword s5, s[2:3], 0x70
+; GFX10_W64-NEXT:    s_load_dword s6, s[2:3], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_mov_b64 vcc, -1
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10_W64-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10_W64-NEXT:    s_endpgm
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
 ; GFX11_W32-NEXT:    s_clause 0x3
-; GFX11_W32-NEXT:    s_load_b32 s2, s[0:1], 0x4c
-; GFX11_W32-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W32-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11_W32-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W32-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, -1
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W32-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W32-NEXT:    s_nop 0
@@ -1158,15 +1165,15 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 ; GFX11_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
 ; GFX11_W64-NEXT:    s_clause 0x3
-; GFX11_W64-NEXT:    s_load_b32 s2, s[0:1], 0x4c
-; GFX11_W64-NEXT:    s_load_b32 s3, s[0:1], 0x70
-; GFX11_W64-NEXT:    s_load_b32 s4, s[0:1], 0x28
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11_W64-NEXT:    s_load_b32 s5, s[2:3], 0x70
+; GFX11_W64-NEXT:    s_load_b32 s6, s[2:3], 0x28
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_mov_b64 vcc, -1
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s3
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
+; GFX11_W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11_W64-NEXT:    v_mov_b32_e32 v1, s5
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11_W64-NEXT:    s_nop 0
@@ -1180,36 +1187,36 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(
 define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %d) {
 ; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xc
-; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[2:3], 0xc
+; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
-; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b64 s[4:5], s[10:11]
+; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
+; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:4 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
+; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX7-NEXT:    s_and_b32 s0, 1, s0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_and_b64 vcc, vcc, s[0:1]
-; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX7-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v3, v4, v1
-; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
+; GFX7-NEXT:    buffer_store_dword v0, off, s[8:11], 0 offset:8
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x30
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x30
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s6
@@ -1243,9 +1250,10 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W32:       ; %bb.0:
-; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX10_W32-NEXT:    s_clause 0x1
+; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10_W32-NEXT:    s_load_dword s0, s[2:3], 0x30
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT:    s_load_dword s0, s[0:1], 0x30
 ; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
@@ -1254,8 +1262,8 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W32-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
@@ -1266,9 +1274,10 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX10_W64:       ; %bb.0:
-; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
+; GFX10_W64-NEXT:    s_clause 0x1
+; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10_W64-NEXT:    s_load_dword s0, s[2:3], 0x30
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x30
 ; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
@@ -1277,8 +1286,8 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W64-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
@@ -1289,9 +1298,11 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX11_W32:       ; %bb.0:
-; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11_W32-NEXT:    s_clause 0x1
+; GFX11_W32-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s0, s[2:3], 0x30
+; GFX11_W32-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W32-NEXT:    s_load_b32 s0, s[0:1], 0x30
 ; GFX11_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    global_load_b32 v2, v1, s[6:7] glc dlc
@@ -1314,9 +1325,11 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
 ; GFX11_W64:       ; %bb.0:
-; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
+; GFX11_W64-NEXT:    s_clause 0x1
+; GFX11_W64-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s0, s[2:3], 0x30
+; GFX11_W64-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W64-NEXT:    s_load_b32 s0, s[0:1], 0x30
 ; GFX11_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    global_load_b32 v2, v1, s[6:7] glc dlc
@@ -1358,68 +1371,73 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [8 x i32], ptr addrspace(1) %in, [8 x i32], ptr addrspace(1) %dummy) {
 ; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xa
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s6, 0
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX7-NEXT:    s_mov_b64 vcc, 0
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX7-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x14
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x14
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX7-NEXT:    s_and_b32 s0, 1, s0
+; GFX7-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX7-NEXT:    s_andn2_b64 s[8:9], 0, exec
+; GFX7-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX7-NEXT:    s_or_b64 vcc, s[8:9], s[0:1]
 ; GFX7-NEXT:  .LBB13_2: ; %exit
-; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7-NEXT:    s_and_b32 s0, 1, s6
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX8-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX8-NEXT:  ; %bb.1: ; %bb
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_and_b32 s0, 1, s0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX8-NEXT:    s_andn2_b64 s[6:7], 0, exec
+; GFX8-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX8-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX8-NEXT:  .LBB13_2: ; %exit
-; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_add_u32 s0, s0, 8
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    s_and_b32 s2, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_nop 2
-; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1427,27 +1445,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W32:       ; %bb.0: ; %entry
-; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
-; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_W32-NEXT:    s_mov_b32 s2, 0
-; GFX10_W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v1, s[0:1]
+; GFX10_W32-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX10_W32-NEXT:    s_and_saveexec_b32 s1, s0
 ; GFX10_W32-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W32-NEXT:  ; %bb.1: ; %bb
-; GFX10_W32-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
+; GFX10_W32-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x50
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX10_W32-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W32-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W32-NEXT:    s_andn2_b32 s4, 0, exec_lo
+; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10_W32-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX10_W32-NEXT:    s_or_b32 vcc_lo, s4, s0
 ; GFX10_W32-NEXT:  .LBB13_2: ; %exit
-; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
+; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
@@ -1457,27 +1477,29 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX10_W64:       ; %bb.0: ; %entry
-; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x28
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT:    s_mov_b32 s4, 0
+; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
-; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[0:1]
+; GFX10_W64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX10_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W64-NEXT:  ; %bb.1: ; %bb
-; GFX10_W64-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x50
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10_W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10_W64-NEXT:    s_andn2_b64 s[6:7], 0, exec
+; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX10_W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX10_W64-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX10_W64-NEXT:  .LBB13_2: ; %exit
-; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
@@ -1487,28 +1509,32 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX11_W32:       ; %bb.0: ; %entry
-; GFX11_W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x28
-; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x28
+; GFX11_W32-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v3
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W32-NEXT:    s_mov_b32 s2, 0
-; GFX11_W32-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11_W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11_W32-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
+; GFX11_W32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11_W32-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX11_W32-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX11_W32-NEXT:  ; %bb.1: ; %bb
-; GFX11_W32-NEXT:    s_load_b64 s[4:5], s[0:1], 0x50
+; GFX11_W32-NEXT:    s_load_b64 s[4:5], s[2:3], 0x50
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_load_b32 s2, s[4:5], 0x0
+; GFX11_W32-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W32-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11_W32-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11_W32-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11_W32-NEXT:    s_and_not1_b32 s4, 0, exec_lo
+; GFX11_W32-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11_W32-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX11_W32-NEXT:    s_or_b32 vcc_lo, s4, s0
 ; GFX11_W32-NEXT:  .LBB13_2: ; %exit
-; GFX11_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
+; GFX11_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
@@ -1518,28 +1544,32 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ;
 ; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
 ; GFX11_W64:       ; %bb.0: ; %entry
-; GFX11_W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x28
-; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W64-NEXT:    s_mov_b32 s4, 0
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x28
+; GFX11_W64-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
+; GFX11_W64-NEXT:    s_mov_b64 vcc, 0
+; GFX11_W64-NEXT:    s_mov_b64 s[4:5], exec
+; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v3
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W64-NEXT:    s_mov_b64 s[2:3], exec
-; GFX11_W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11_W64-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
+; GFX11_W64-NEXT:    v_cmpx_eq_u32_e32 0, v3
 ; GFX11_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX11_W64-NEXT:  ; %bb.1: ; %bb
-; GFX11_W64-NEXT:    s_load_b64 s[4:5], s[0:1], 0x50
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x50
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_load_b32 s4, s[4:5], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11_W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11_W64-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11_W64-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX11_W64-NEXT:    s_and_not1_b64 s[6:7], 0, exec
+; GFX11_W64-NEXT:    s_and_b32 s0, 1, s0
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
+; GFX11_W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX11_W64-NEXT:    s_or_b64 vcc, s[6:7], s[0:1]
 ; GFX11_W64-NEXT:  .LBB13_2: ; %exit
-; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
+; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    global_store_b32 v1, v0, s[0:1] offset:8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 5dd4fa0809131f..6ebd8c6146095b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -69,15 +69,36 @@ define i8 @v_lshr_i8_7(i8 %value) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
-; GCN-LABEL: s_lshr_i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xff
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, %amount
@@ -85,14 +106,30 @@ define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
 }
 
 define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
-; GCN-LABEL: s_lshr_i8_7:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i8_7:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i8_7:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i8_7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 7
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i8_7:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x10007
+; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 7
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i8 %value, 7
   ret i8 %result
@@ -619,15 +656,30 @@ define i16 @v_lshr_i16_15(i16 %value) {
 }
 
 define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
-; GCN-LABEL: s_lshr_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
-; GCN-NEXT:    s_lshr_b32 s0, s0, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i16 %value, %amount
@@ -635,14 +687,27 @@ define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
 }
 
 define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
-; GCN-LABEL: s_lshr_i16_15:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_bfe_u32 s0, s0, 0x1000f
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_lshr_i16_15:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x1000f
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: s_lshr_i16_15:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 15
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_lshr_i16_15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 15
+; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_lshr_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x1000f
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 15
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = lshr i16 %value, 15
   ret i16 %result
@@ -783,13 +848,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
 ; GFX8-LABEL: s_lshr_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshr_b32 s1, s2, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v2i16:
@@ -970,21 +1035,21 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8-LABEL: s_lshr_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshr_b32 s2, s4, s6
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshr_b32 s3, s5, s7
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v4i16:
@@ -1155,37 +1220,37 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8-LABEL: s_lshr_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshr_b32 s4, s8, s12
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshr_b32 s5, s9, s13
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s10, s14
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, s7
 ; GFX8-NEXT:    s_lshr_b32 s7, s11, s15
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_lshr_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 42f1bf84c04207..306b5579bebbcb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -7,37 +7,18 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
 
 define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
-; GFX7-LABEL: s_mul_i16:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mul_i32 s0, s0, s1
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_mul_i16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_mul_i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_mul_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
@@ -93,35 +74,27 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 ;
 ; GFX8-LABEL: s_mul_i16_zeroext:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_mul_i16_zeroext:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16_zeroext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16_zeroext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
   ret i16 %result
@@ -170,42 +143,22 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
 }
 
 define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
-; GFX7-LABEL: s_mul_i16_signext:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mul_i32 s0, s0, s1
-; GFX7-NEXT:    s_sext_i32_i16 s0, s0
-; GFX7-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_mul_i16_signext:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_mul_i32 s0, s0, s1
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_mul_i16_signext:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX9-NEXT:    s_mul_i32 s0, s0, s1
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_mul_i16_signext:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mul_i32 s0, s0, s1
+; GCN-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_mul_i16_signext:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
 ;
 ; GFX12-LABEL: s_mul_i16_signext:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mul_i32 s0, s0, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX12-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06cd..2e58696518a9f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -349,63 +349,67 @@ define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32
 }
 
 define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_orn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_orn2_i16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s3, -1
+; GFX9-NEXT:    s_or_b32 s0, s2, s0
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_orn2_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT:    s_or_b32 s0, s2, s0
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %or = or i16 %src0, %not.src1
   ret i16 %or
 }
 
 define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16_commute:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_orn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_commute:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_orn2_i16_commute:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_commute:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s0, s3, -1
+; GFX9-NEXT:    s_or_b32 s0, s0, s2
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_orn2_i16_commute:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_commute:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s0, s3, -1
+; GFX10PLUS-NEXT:    s_or_b32 s0, s0, s2
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %or = or i16 %not.src1, %src0
   ret i16 %or
 }
 
 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
-; GCN-LABEL: s_orn2_i16_multi_use:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_xor_b32 s1, s3, -1
-; GCN-NEXT:    s_orn2_b32 s0, s2, s3
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_multi_use:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_xor_b32 s1, s3, -1
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s3
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_orn2_i16_multi_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_orn2_b32 s0, s2, s3
-; GFX10-NEXT:    s_xor_b32 s1, s3, -1
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_multi_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s1, s3, -1
+; GFX9-NEXT:    s_or_b32 s0, s2, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_orn2_i16_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_or_not1_b32 s0, s2, s3
-; GFX11-NEXT:    s_xor_b32 s1, s3, -1
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_multi_use:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s1, s3, -1
+; GFX10PLUS-NEXT:    s_or_b32 s0, s2, s1
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src1 = xor i16 %src1, -1
   %or = or i16 %src0, %not.src1
   %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0
@@ -414,23 +418,25 @@ define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %
 }
 
 define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
-; GCN-LABEL: s_orn2_i16_multi_foldable_use:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_orn2_b32 s0, s2, s4
-; GCN-NEXT:    s_orn2_b32 s1, s3, s4
-; GCN-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_orn2_b32 s0, s2, s4
+; GFX6-NEXT:    s_orn2_b32 s1, s3, s4
+; GFX6-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_orn2_b32 s0, s2, s4
-; GFX10-NEXT:    s_orn2_b32 s1, s3, s4
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX9-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_xor_b32 s1, s4, -1
+; GFX9-NEXT:    s_or_b32 s0, s2, s1
+; GFX9-NEXT:    s_or_b32 s1, s3, s1
+; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_orn2_i16_multi_foldable_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_or_not1_b32 s0, s2, s4
-; GFX11-NEXT:    s_or_not1_b32 s1, s3, s4
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_orn2_i16_multi_foldable_use:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_xor_b32 s1, s4, -1
+; GFX10PLUS-NEXT:    s_or_b32 s0, s2, s1
+; GFX10PLUS-NEXT:    s_or_b32 s1, s3, s1
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %not.src2 = xor i16 %src2, -1
   %or0 = or i16 %src0, %not.src2
   %or1 = or i16 %src1, %not.src2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c024..fc852aa416cd7d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -40,30 +40,14 @@ define i8 @v_sext_inreg_i8_7(i8 %value) {
 }
 
 define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) {
-; GFX6-LABEL: s_sext_inreg_i8:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x50000
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_sext_inreg_i8:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 3
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_sext_inreg_i8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 3
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_sext_inreg_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x50000
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 3
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 3
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x50000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i8 %value, 3
   %ashr = ashr i8 %shl, 3
@@ -71,30 +55,14 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) {
 }
 
 define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) {
-; GFX6-LABEL: s_sext_inreg_i8_6:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
-; GFX6-NEXT:    ; return to shader part epilog
-;
-; GFX8-LABEL: s_sext_inreg_i8_6:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX8-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 6
-; GFX8-NEXT:    ; return to shader part epilog
-;
-; GFX9-LABEL: s_sext_inreg_i8_6:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX9-NEXT:    s_sext_i32_i8 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 6
-; GFX9-NEXT:    ; return to shader part epilog
+; GCN-LABEL: s_sext_inreg_i8_6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_bfe_i32 s0, s0, 0x20000
+; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i8_6:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 6
-; GFX10PLUS-NEXT:    s_sext_i32_i8 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 6
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i8 %value, 6
   %ashr = ashr i8 %shl, 6
@@ -585,16 +553,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) {
 ;
 ; GFX9-LABEL: s_sext_inreg_i16_9:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 9
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 9
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x70000
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i16_9:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 9
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 9
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x70000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i16 %value, 9
   %ashr = ashr i16 %shl, 9
@@ -616,16 +580,12 @@ define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) {
 ;
 ; GFX9-LABEL: s_sext_inreg_i16_15:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshl_b32 s0, s0, 15
-; GFX9-NEXT:    s_sext_i32_i16 s0, s0
-; GFX9-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX9-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_sext_inreg_i16_15:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 15
-; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
-; GFX10PLUS-NEXT:    s_ashr_i32 s0, s0, 15
+; GFX10PLUS-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %shl = shl i16 %value, 15
   %ashr = ashr i16 %shl, 15
@@ -720,15 +680,16 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v2i16_11:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_lshl_b32 s0, s0, 11
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 11
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_lshl_b32 s0, s0, 11
 ; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 11
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s1, 11
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 11
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v2i16_11:
@@ -854,25 +815,27 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v4i16_14:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 14
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 14
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 14
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 14
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 14
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_sext_i32_i16 s2, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 14
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 14
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 14
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 14
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 14
+; GFX8-NEXT:    s_lshl_b32 s1, s1, 14
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 14
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 14
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v4i16_14:
@@ -1068,45 +1031,49 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) {
 ; GFX8-LABEL: s_sext_inreg_v8i16_5:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 5
 ; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 5
-; GFX8-NEXT:    s_lshl_b32 s4, s4, 5
+; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s0, s0
+; GFX8-NEXT:    s_ashr_i32 s4, s4, 5
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 5
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX8-NEXT:    s_ashr_i32 s0, s0, 5
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 5
-; GFX8-NEXT:    s_lshl_b32 s5, s5, 5
-; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_sext_i32_i16 s4, s4
+; GFX8-NEXT:    s_sext_i32_i16 s5, s5
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
+; GFX8-NEXT:    s_ashr_i32 s5, s5, 5
+; GFX8-NEXT:    s_lshl_b32 s6, s6, 5
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 5
-; GFX8-NEXT:    s_lshl_b32 s6, s6, 5
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
-; GFX8-NEXT:    s_sext_i32_i16 s5, s5
-; GFX8-NEXT:    s_ashr_i32 s0, s0, 5
-; GFX8-NEXT:    s_ashr_i32 s4, s4, 5
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 5
-; GFX8-NEXT:    s_lshl_b32 s7, s7, 5
-; GFX8-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX8-NEXT:    s_sext_i32_i16 s6, s6
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 5
-; GFX8-NEXT:    s_ashr_i32 s5, s5, 5
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
+; GFX8-NEXT:    s_sext_i32_i16 s2, s2
+; GFX8-NEXT:    s_ashr_i32 s6, s6, 5
+; GFX8-NEXT:    s_lshl_b32 s7, s7, 5
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sext_i32_i16 s3, s3
-; GFX8-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX8-NEXT:    s_ashr_i32 s2, s2, 5
-; GFX8-NEXT:    s_ashr_i32 s6, s6, 5
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_ashr_i32 s3, s3, 5
+; GFX8-NEXT:    s_lshl_b32 s3, s3, 5
+; GFX8-NEXT:    s_sext_i32_i16 s7, s7
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
+; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_ashr_i32 s7, s7, 5
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
-; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_ashr_i32 s3, s3, 5
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_sext_inreg_v8i16_5:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 3729f1cc2b12d9..218d487aee4137 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0x3fff
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i32_zext_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0x3fff
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i32_zext_i16:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0x3fff
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX10PLUS-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %and = and i16 %x, 16383
   %ext = zext i16 %and to i32
@@ -634,13 +637,12 @@ define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
 ;
 ; GFX8-LABEL: s_shl_v2i32_zext_v2i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_movk_i32 s2, 0x3fff
-; GFX8-NEXT:    s_mov_b32 s3, s2
+; GFX8-NEXT:    s_and_b32 s0, s0, 0x3fff3fff
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v2i32_zext_v2i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index 4cf1c92539c36f..c7603b7cec04a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -71,19 +71,22 @@ define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) {
 ;
 ; GFX8-LABEL: s_shl_i8:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i8:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i8 %value, %amount
@@ -627,19 +630,19 @@ define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) {
 ;
 ; GFX8-LABEL: s_shl_i16:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_i16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10PLUS-LABEL: s_shl_i16:
 ; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX10PLUS-NEXT:    ; return to shader part epilog
   %result = shl i16 %value, %amount
@@ -791,13 +794,14 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 ; GFX8-LABEL: s_shl_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v2i16:
@@ -976,21 +980,23 @@ define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
 ; GFX8-LABEL: s_shl_v4i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX8-NEXT:    s_lshl_b32 s2, s4, s6
+; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX8-NEXT:    s_lshl_b32 s3, s5, s7
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
-; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NEXT:    s_or_b32 s1, s1, s2
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v4i16:
@@ -1157,37 +1163,41 @@ define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
 ; GFX8-LABEL: s_shl_v8i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, s4
 ; GFX8-NEXT:    s_lshl_b32 s4, s8, s12
-; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
+; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX8-NEXT:    s_lshl_b32 s5, s9, s13
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    s_or_b32 s0, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s6, s10, s14
-; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
+; GFX8-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX8-NEXT:    s_or_b32 s1, s1, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, s7
 ; GFX8-NEXT:    s_lshl_b32 s7, s11, s15
-; GFX8-NEXT:    s_or_b32 s1, s4, s1
-; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s2, s4, s2
-; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX8-NEXT:    s_or_b32 s3, s4, s3
+; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s2, s2, s4
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: s_shl_v8i16:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index 855687281ce9ab..49ba01aaf9e4fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -244,12 +244,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffffffc0
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffffffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_splat:
@@ -284,12 +284,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_lo:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffc0
 ; GFX8-NEXT:    s_sub_i32 s1, s1, 4
+; GFX8-NEXT:    s_sub_i32 s0, s0, 0xffffffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_lo:
@@ -324,12 +324,12 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX8-LABEL: s_sub_v2i16_neg_inline_imm_hi:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffffffc0
 ; GFX8-NEXT:    s_sub_i32 s0, s0, 4
-; GFX8-NEXT:    s_sub_i32 s1, s1, 0xffc0
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_neg_inline_imm_hi:
@@ -365,14 +365,13 @@ define amdgpu_ps i32 @s_sub_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX8-LABEL: s_sub_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16:
@@ -412,14 +411,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs:
@@ -463,14 +461,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_rhs:
@@ -516,14 +513,13 @@ define amdgpu_ps i32 @s_sub_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX8-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX8-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX8-NEXT:    s_sub_i32 s0, s0, s1
 ; GFX8-NEXT:    s_sub_i32 s1, s2, s3
+; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_sub_v2i16_fneg_lhs_fneg_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
index 6bb4e2d3dbe26e..7c9d8cba0fbb27 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll
@@ -35,15 +35,8 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v2i16_one_use:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_xor_b32 s0, s0, s1
-; GFX8-NEXT:    s_mov_b32 s3, s2
-; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NEXT:    s_xor_b32 s0, s0, -1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX900-LABEL: scalar_xnor_v2i16_one_use:
@@ -129,21 +122,10 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
 ;
 ; GFX8-LABEL: scalar_xnor_v4i16_one_use:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_mov_b32 s4, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_mov_b32 s4, -1
 ; GFX8-NEXT:    s_mov_b32 s5, s4
-; GFX8-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX8-NEXT:    s_and_b32 s2, s0, 0xffff
-; GFX8-NEXT:    s_lshr_b32 s7, s1, 16
-; GFX8-NEXT:    s_and_b32 s6, s1, 0xffff
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[6:7], s[4:5]
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX8-NEXT:    s_or_b32 s0, s1, s0
-; GFX8-NEXT:    s_lshl_b32 s1, s3, 16
-; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT:    s_or_b32 s1, s1, s2
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
 ; GFX900-LABEL: scalar_xnor_v4i16_one_use:
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index 4cc384e9d27188..b3cba9177c4f85 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -103,13 +103,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    s_add_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s0
-; VI-NEXT:    s_add_i32 s1, s1, s3
-; VI-NEXT:    s_and_b32 s0, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s2, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -175,8 +175,8 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s0, s2, 16
 ; VI-NEXT:    s_and_b32 s1, s2, 0xffff
-; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_add_i32 s0, s0, s0
+; VI-NEXT:    s_add_i32 s1, s1, s1
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s1, s0
@@ -230,12 +230,12 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
 ; VI-NEXT:    s_add_i32 s2, s2, s3
-; VI-NEXT:    s_add_i32 s4, s4, s5
+; VI-NEXT:    s_add_i32 s5, s5, s4
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_lshl_b32 s3, s5, 16
 ; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index bf72cccd912cee..de318e7ae31a5b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,12 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT:    s_movk_i32 s0, 0x80
+; GCN-NEXT:    s_cselect_b32 s0, s0, 0x83
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
   %select = select i1 %cond, i16 5, i16 8
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 210356d1313501..b8585120afa45f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s
+; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
+; RUN: opt -S -amdgpu-codegenprepare-widen-16-bit-ops -mtriple=amdgcn-- -mcpu=tonga -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s
 
 define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
 ; SI-LABEL: @add_i3(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
index 0025d23b108038..32b2fa238cbac4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
@@ -18,189 +18,33 @@ declare hidden half @_Z4pownDhi(half, i32)
 ; --------------------------------------------------------------------
 
 define half @test_pow_fast_f16(half %x, half %y) {
-; CHECK-LABEL: test_pow_fast_f16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powDhDh at rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powDhDh at rel32@hi+12
-; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
   ret half %pow
 }
 
 define float @test_pow_fast_f32(float %x, float %y) {
-; CHECK-LABEL: test_pow_fast_f32:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powff at rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powff at rel32@hi+12
-; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast float @_Z3powff(float %x, float %y)
   ret float %pow
 }
 
 define double @test_pow_fast_f64(double %x, double %y) {
-; CHECK-LABEL: test_pow_fast_f64:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, _Z3powdd at rel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, _Z3powdd at rel32@hi+12
-; CHECK-NEXT:    s_setpc_b64 s[16:17]
   %pow = tail call fast double @_Z3powdd(double %x, double %y)
   ret double %pow
 }
 
 define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
-; CHECK-LABEL: test_pow_fast_f16__integral_y:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
-; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
-; CHECK-NEXT:    v_exp_f16_e32 v2, v2
-; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to half
   %pow = tail call fast half @_Z3powDhDh(half %x, half %y)
   ret half %pow
 }
 
 define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
-; CHECK-LABEL: test_pow_fast_f32__integral_y:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    s_mov_b32 s4, 0x800000
-; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; CHECK-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
-; CHECK-NEXT:    v_log_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
-; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
-; CHECK-NEXT:    v_exp_f32_e32 v2, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
-; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float
   %pow = tail call fast float @_Z3powff(float %x, float %y)
   ret float %pow
 }
 
 define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
-; CHECK-LABEL: test_pow_fast_f64__integral_y:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v41, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to double
   %pow = tail call fast double @_Z3powdd(double %x, double %y)
   ret double %pow
@@ -211,132 +55,16 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; --------------------------------------------------------------------
 
 define half @test_powr_fast_f16(half %x, half %y) {
-; CHECK-LABEL: test_powr_fast_f16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_log_f16_e32 v0, v0
-; CHECK-NEXT:    v_mul_f16_e32 v0, v1, v0
-; CHECK-NEXT:    v_exp_f16_e32 v0, v0
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
   ret half %powr
 }
 
 define float @test_powr_fast_f32(float %x, float %y) {
-; CHECK-LABEL: test_powr_fast_f32:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v3
-; CHECK-NEXT:    v_log_f32_e32 v0, v0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, v1, v0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; CHECK-NEXT:    v_fma_f32 v0, v1, v0, v2
-; CHECK-NEXT:    v_exp_f32_e32 v0, v0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast float @_Z4powrff(float %x, float %y)
   ret float %powr
 }
 
 define double @test_powr_fast_f64(double %x, double %y) {
-; CHECK-LABEL: test_powr_fast_f64:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v42, v31
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v41, v3
-; CHECK-NEXT:    v_mov_b32_e32 v40, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[40:41], v[0:1]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v42
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %powr = tail call fast double @_Z4powrdd(double %x, double %y)
   ret double %powr
 }
@@ -346,429 +74,51 @@ define double @test_powr_fast_f64(double %x, double %y) {
 ; --------------------------------------------------------------------
 
 define half @test_pown_fast_f16(half %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f16:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v2, v1
-; CHECK-NEXT:    v_log_f16_e64 v3, |v0|
-; CHECK-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, v1, v0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_f16_e32 v2, v3, v2
-; CHECK-NEXT:    v_exp_f16_e32 v2, v2
-; CHECK-NEXT:    v_or_b32_e32 v0, v0, v2
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32(float %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f32:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x800000
-; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
-; CHECK-NEXT:    v_log_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v4, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v4
-; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
-; CHECK-NEXT:    v_fma_f32 v2, v2, v4, v3
-; CHECK-NEXT:    v_exp_f32_e32 v2, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
-; CHECK-NEXT:    v_mul_f32_e32 v2, v2, v3
-; CHECK-NEXT:    v_and_or_b32 v0, v1, v0, v2
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64(double %x, i32 %y) {
-; CHECK-LABEL: test_pown_fast_f64:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v42, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    v_mov_b32_e32 v41, v2
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
 
 define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f16_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    v_log_f16_e64 v0, |v0|
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f16_e32 v0, v0, v1
-; CHECK-NEXT:    v_exp_f16_e32 v0, v0
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f32_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x800000
-; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e64 v0, |v0|, v3
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_log_f32_e32 v0, v0
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NEXT:    v_sub_f32_e32 v0, v0, v2
-; CHECK-NEXT:    v_mul_f32_e32 v2, v0, v1
-; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
-; CHECK-NEXT:    v_fma_f32 v0, v0, v1, v2
-; CHECK-NEXT:    v_exp_f32_e32 v0, v0
-; CHECK-NEXT:    v_mov_b32_e32 v1, 0x1f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f64_known_even:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v42, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v42, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v42, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v42, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v42, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v42, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v42, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v42, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v42, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x400
-; CHECK-NEXT:    v_writelane_b32 v42, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v42, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v42, s42, 10
-; CHECK-NEXT:    v_writelane_b32 v42, s43, 11
-; CHECK-NEXT:    v_writelane_b32 v42, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v42, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v41, 1, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_readlane_b32 s45, v42, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v42, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v42, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v42, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v42, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v42, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v42, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v42, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v42, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v42, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v42, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v42, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v42, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v42, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v42, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xfc00
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = shl i32 %y.arg, 1
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
 }
 
 define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f16_known_odd:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    v_log_f16_e64 v2, |v0|
-; CHECK-NEXT:    s_movk_i32 s4, 0x7fff
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_f16_e32 v1, v2, v1
-; CHECK-NEXT:    v_exp_f16_e32 v1, v1
-; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
   ret half %call
 }
 
 define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f32_known_odd:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x800000
-; CHECK-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 1.0, v3, vcc
-; CHECK-NEXT:    v_mul_f32_e64 v3, |v0|, v3
-; CHECK-NEXT:    v_or_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_log_f32_e32 v3, v3
-; CHECK-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; CHECK-NEXT:    v_sub_f32_e32 v2, v3, v2
-; CHECK-NEXT:    v_mul_f32_e32 v3, v2, v1
-; CHECK-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0x42800000
-; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; CHECK-NEXT:    v_fma_f32 v1, v2, v1, v3
-; CHECK-NEXT:    v_exp_f32_e32 v1, v1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1f800000
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; CHECK-NEXT:    s_brev_b32 s4, -2
-; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v2
-; CHECK-NEXT:    v_bfi_b32 v0, s4, v1, v0
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast float @_Z4pownfi(float %x, i32 %y)
   ret float %call
 }
 
 define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
-; CHECK-LABEL: test_pown_fast_f64_known_odd:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s16, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b64 s[18:19], -1
-; CHECK-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_mov_b64 exec, s[18:19]
-; CHECK-NEXT:    v_writelane_b32 v43, s16, 14
-; CHECK-NEXT:    v_writelane_b32 v43, s30, 0
-; CHECK-NEXT:    v_writelane_b32 v43, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v43, s34, 2
-; CHECK-NEXT:    v_writelane_b32 v43, s35, 3
-; CHECK-NEXT:    v_writelane_b32 v43, s36, 4
-; CHECK-NEXT:    v_writelane_b32 v43, s37, 5
-; CHECK-NEXT:    v_writelane_b32 v43, s38, 6
-; CHECK-NEXT:    v_writelane_b32 v43, s39, 7
-; CHECK-NEXT:    s_addk_i32 s32, 0x800
-; CHECK-NEXT:    v_writelane_b32 v43, s40, 8
-; CHECK-NEXT:    v_writelane_b32 v43, s41, 9
-; CHECK-NEXT:    s_mov_b64 s[40:41], s[4:5]
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4log2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4log2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    v_writelane_b32 v43, s42, 10
-; CHECK-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_writelane_b32 v43, s43, 11
-; CHECK-NEXT:    v_mov_b32_e32 v41, v1
-; CHECK-NEXT:    v_writelane_b32 v43, s44, 12
-; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v41
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_writelane_b32 v43, s45, 13
-; CHECK-NEXT:    v_mov_b32_e32 v40, v31
-; CHECK-NEXT:    s_mov_b64 s[34:35], s[6:7]
-; CHECK-NEXT:    s_mov_b32 s42, s15
-; CHECK-NEXT:    s_mov_b32 s43, s14
-; CHECK-NEXT:    s_mov_b32 s44, s13
-; CHECK-NEXT:    s_mov_b32 s45, s12
-; CHECK-NEXT:    s_mov_b64 s[36:37], s[10:11]
-; CHECK-NEXT:    s_mov_b64 s[38:39], s[8:9]
-; CHECK-NEXT:    v_or_b32_e32 v42, 1, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_cvt_f64_i32_e32 v[2:3], v42
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, _Z4exp2d at gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, _Z4exp2d at gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[40:41]
-; CHECK-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
-; CHECK-NEXT:    s_mov_b64 s[6:7], s[34:35]
-; CHECK-NEXT:    s_mov_b64 s[8:9], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[36:37]
-; CHECK-NEXT:    s_mov_b32 s12, s45
-; CHECK-NEXT:    s_mov_b32 s13, s44
-; CHECK-NEXT:    s_mov_b32 s14, s43
-; CHECK-NEXT:    s_mov_b32 s15, s42
-; CHECK-NEXT:    v_mov_b32_e32 v31, v40
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x80000000, v41
-; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT:    v_or_b32_e32 v1, v2, v1
-; CHECK-NEXT:    v_readlane_b32 s45, v43, 13
-; CHECK-NEXT:    v_readlane_b32 s44, v43, 12
-; CHECK-NEXT:    v_readlane_b32 s43, v43, 11
-; CHECK-NEXT:    v_readlane_b32 s42, v43, 10
-; CHECK-NEXT:    v_readlane_b32 s41, v43, 9
-; CHECK-NEXT:    v_readlane_b32 s40, v43, 8
-; CHECK-NEXT:    v_readlane_b32 s39, v43, 7
-; CHECK-NEXT:    v_readlane_b32 s38, v43, 6
-; CHECK-NEXT:    v_readlane_b32 s37, v43, 5
-; CHECK-NEXT:    v_readlane_b32 s36, v43, 4
-; CHECK-NEXT:    v_readlane_b32 s35, v43, 3
-; CHECK-NEXT:    v_readlane_b32 s34, v43, 2
-; CHECK-NEXT:    v_readlane_b32 s31, v43, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v43, 0
-; CHECK-NEXT:    v_readlane_b32 s4, v43, 14
-; CHECK-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; CHECK-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b64 exec, s[6:7]
-; CHECK-NEXT:    s_addk_i32 s32, 0xf800
-; CHECK-NEXT:    s_mov_b32 s33, s4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %call = tail call fast double @_Z4powndi(double %x, i32 %y)
   ret double %call
@@ -776,3 +126,5 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
 
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 9ec8e425a3f55c..5889af70a8f092 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -236,7 +236,7 @@ entry:
 ; R600-VECT: MOVA_INT
 
 ; SI-PROMOTE-VECT-DAG: s_lshl_b32
-; SI-PROMOTE-VECT-DAG: v_lshrrev
+; SI-PROMOTE-VECT-DAG: s_lshr_b32
 
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ; encoding: [0x00,0x00,0x60,0xe0
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:1 ; encoding: [0x01,0x00,0x60,0xe0
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..115cb40676da8c 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -27,11 +27,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX8-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT:    v_not_b32_e32 v0, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -42,11 +40,9 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX9-NEXT:    v_not_b32_e32 v0, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 6f52da2631b8a6..89735592cfa8a2 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -50,8 +50,7 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GISEL-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    s_and_b32 s2, s4, 0xffff
-; GISEL-NEXT:    s_brev_b32 s2, s2
+; GISEL-NEXT:    s_brev_b32 s2, s4
 ; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GISEL-NEXT:    v_mov_b32_e32 v2, s2
@@ -80,11 +79,9 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX11-GISEL-NEXT:    s_brev_b32 s2, s4
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
 ; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 04d72691a088ab..86254329923971 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -951,11 +951,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
index 82808cd3092270..07816f1ed6a650 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=ISA
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -stop-before=si-fix-sgpr-copies < %s | FileCheck %s -check-prefix=MIR
 
@@ -41,67 +40,6 @@ define void @f(i32 %arg, ptr %ptr) {
 ; ISA-NEXT:    flat_store_dword v[1:2], v7
 ; ISA-NEXT:    s_waitcnt lgkmcnt(0)
 ; ISA-NEXT:    s_setpc_b64 s[30:31]
-  ; MIR-LABEL: name: f
-  ; MIR: bb.0.bb:
-  ; MIR-NEXT:   successors: %bb.1(0x80000000)
-  ; MIR-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; MIR-NEXT: {{  $}}
-  ; MIR-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; MIR-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; MIR-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; MIR-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; MIR-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; MIR-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; MIR-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
-  ; MIR-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; MIR-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; MIR-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; MIR-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
-  ; MIR-NEXT:   [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
-  ; MIR-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; MIR-NEXT:   S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
-  ; MIR-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT:   $scc = COPY [[COPY6]]
-  ; MIR-NEXT:   [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
-  ; MIR-NEXT:   [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
-  ; MIR-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
-  ; MIR-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; MIR-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
-  ; MIR-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
-  ; MIR-NEXT:   [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
-  ; MIR-NEXT:   $scc = COPY [[COPY6]]
-  ; MIR-NEXT:   [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
-  ; MIR-NEXT:   [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
-  ; MIR-NEXT:   [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
-  ; MIR-NEXT:   $scc = COPY [[COPY6]]
-  ; MIR-NEXT:   [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
-  ; MIR-NEXT:   [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
-  ; MIR-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
-  ; MIR-NEXT:   [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
-  ; MIR-NEXT: {{  $}}
-  ; MIR-NEXT: bb.1.bb14:
-  ; MIR-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-  ; MIR-NEXT: {{  $}}
-  ; MIR-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
-  ; MIR-NEXT:   [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
-  ; MIR-NEXT:   [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
-  ; MIR-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
-  ; MIR-NEXT:   [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT:   [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
-  ; MIR-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT:   S_BRANCH %bb.2
-  ; MIR-NEXT: {{  $}}
-  ; MIR-NEXT: bb.2.bb21:
-  ; MIR-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
-  ; MIR-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
-  ; MIR-NEXT:   SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT:   FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
-  ; MIR-NEXT:   SI_RETURN
 bb:
   %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
   %i1 = extractelement <2 x i32> %i, i64 1
@@ -134,3 +72,5 @@ bb21:
 }
 
 declare float @llvm.fabs.f32(float)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; MIR: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 231d3d97c8f4f3..5f8f898351fe8d 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1069,31 +1069,32 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v2i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_add_u32_sdwa v2, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; VI-NEXT:    s_add_i32 s0, s0, s0
+; VI-NEXT:    s_add_i32 s1, s1, s1
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s1, s1, 8
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amd_kernel_v2i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1136,51 +1137,52 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v4i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s1, s0, 24
-; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 24
+; VI-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; VI-NEXT:    s_add_i32 s2, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_or_b32 s1, s1, s2
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s2, s3, 8
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amd_kernel_v4i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
 ; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 24
-; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v2, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, s1, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX11-NEXT:    s_add_i32 s3, s0, s0
+; GFX11-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s3, s0
+; GFX11-NEXT:    s_or_b32 s1, s1, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1219,45 +1221,44 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v3i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dword s0, s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v2, 0
-; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_bfe_u32 s2, s0, 0x80008
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_add_i32 s2, s2, s2
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_or_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, 2
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    flat_store_byte v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    flat_store_byte v[0:1], v5
-; VI-NEXT:    flat_store_short v[2:3], v4
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amd_kernel_v3i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v6, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_bfe_u32 s2, s0, 0x80008
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v6, off
-; GFX11-NEXT:    global_store_b16 v[2:3], v4, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1304,60 +1305,62 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v5i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 4
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s0, 24
-; VI-NEXT:    s_lshr_b32 s3, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
+; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_and_b32 s1, s1, 0xff
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s4, s4, s4
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s3, s4, 8
+; VI-NEXT:    s_or_b32 s0, s0, s3
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, 4
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    flat_store_byte v[0:1], v2
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v2, 0
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    flat_store_byte v[0:1], v5
-; VI-NEXT:    flat_store_dword v[2:3], v4
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amd_kernel_v5i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
-; GFX11-NEXT:    v_add_nc_u16 v1, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, s3
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v6, s1, s1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v4, 0xffff, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v5, 16, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    s_add_i32 s4, s0, s0
+; GFX11-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX11-NEXT:    s_add_i32 s3, s3, s3
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s4, s0
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v6, off
-; GFX11-NEXT:    global_store_b32 v[2:3], v4, off
+; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1416,35 +1419,43 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v8i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s1, 24
-; VI-NEXT:    s_lshr_b32 s3, s1, 16
+; VI-NEXT:    s_lshr_b32 s2, s0, 16
+; VI-NEXT:    s_lshr_b32 s3, s0, 24
+; VI-NEXT:    s_lshr_b32 s4, s1, 16
+; VI-NEXT:    s_lshr_b32 s5, s1, 24
+; VI-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; VI-NEXT:    s_add_i32 s5, s5, s5
+; VI-NEXT:    s_add_i32 s4, s4, s4
 ; VI-NEXT:    s_add_i32 s3, s3, s3
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 8, s2
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_lshr_b32 s4, s0, 24
-; VI-NEXT:    s_lshr_b32 s5, s0, 16
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_lshl_b32 s3, s3, 8
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 8, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s5
-; VI-NEXT:    v_or_b32_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, 0
-; VI-NEXT:    v_mov_b32_e32 v3, 0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s6, s6, s6
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s7, 8
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s3, s6, 8
+; VI-NEXT:    s_or_b32 s1, s1, s5
+; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s1, s1, s4
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1452,45 +1463,42 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT:    v_lshrrev_b16 v1, 8, s1
 ; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 24
 ; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s1, 24
-; GFX11-NEXT:    v_add_nc_u16 v2, s1, s1
-; GFX11-NEXT:    v_add_nc_u16 v3, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v4, s5, s5
-; GFX11-NEXT:    v_add_nc_u16 v5, s4, s4
-; GFX11-NEXT:    v_add_nc_u16 v6, s3, s3
-; GFX11-NEXT:    v_add_nc_u16 v7, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v4, 8, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0xffff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s7, s7, s7
+; GFX11-NEXT:    s_add_i32 s5, s5, s5
+; GFX11-NEXT:    s_add_i32 s4, s4, s4
+; GFX11-NEXT:    s_add_i32 s6, s6, s6
+; GFX11-NEXT:    s_add_i32 s3, s3, s3
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s1, s1, s7
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_or_b32 s0, s0, s6
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_lshl_b32 s3, s4, 16
+; GFX11-NEXT:    s_or_b32 s0, s0, s2
+; GFX11-NEXT:    s_or_b32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -1583,61 +1591,77 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v16i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s3, 24
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
+; VI-NEXT:    s_lshr_b32 s4, s0, 16
+; VI-NEXT:    s_lshr_b32 s5, s0, 24
+; VI-NEXT:    s_lshr_b32 s6, s1, 16
+; VI-NEXT:    s_lshr_b32 s7, s1, 24
+; VI-NEXT:    s_lshr_b32 s8, s2, 16
+; VI-NEXT:    s_lshr_b32 s9, s2, 24
+; VI-NEXT:    s_lshr_b32 s10, s3, 16
+; VI-NEXT:    s_lshr_b32 s11, s3, 24
+; VI-NEXT:    s_bfe_u32 s12, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s15, s3, 0x80008
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
+; VI-NEXT:    s_add_i32 s9, s9, s9
+; VI-NEXT:    s_add_i32 s8, s8, s8
+; VI-NEXT:    s_add_i32 s7, s7, s7
+; VI-NEXT:    s_add_i32 s6, s6, s6
 ; VI-NEXT:    s_add_i32 s5, s5, s5
 ; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
 ; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s4
-; VI-NEXT:    v_mov_b32_e32 v5, s5
-; VI-NEXT:    s_lshr_b32 s6, s2, 24
-; VI-NEXT:    s_lshr_b32 s7, s2, 16
-; VI-NEXT:    v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v5, s3
-; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    s_add_i32 s15, s15, s15
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
 ; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s6
-; VI-NEXT:    v_mov_b32_e32 v5, s7
-; VI-NEXT:    s_lshr_b32 s8, s1, 24
-; VI-NEXT:    s_lshr_b32 s9, s1, 16
-; VI-NEXT:    v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v5, s2
-; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_add_i32 s14, s14, s14
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
 ; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s8
-; VI-NEXT:    v_mov_b32_e32 v5, s9
-; VI-NEXT:    s_lshr_b32 s10, s0, 24
-; VI-NEXT:    s_lshr_b32 s11, s0, 16
-; VI-NEXT:    v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
 ; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s10
-; VI-NEXT:    v_mov_b32_e32 v5, s11
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v4, 0
-; VI-NEXT:    v_mov_b32_e32 v5, 0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s15, 8
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s14, 8
+; VI-NEXT:    s_or_b32 s6, s6, s7
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s7, s13, 8
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s12, 8
+; VI-NEXT:    s_or_b32 s3, s3, s11
+; VI-NEXT:    s_or_b32 s2, s2, s9
+; VI-NEXT:    s_or_b32 s1, s1, s7
+; VI-NEXT:    s_or_b32 s0, s0, s5
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s10
+; VI-NEXT:    s_or_b32 s2, s2, s8
+; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_or_b32 s0, s0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -1645,72 +1669,73 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX11-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX11-NEXT:    s_lshr_b32 s10, s3, 16
 ; GFX11-NEXT:    s_lshr_b32 s11, s3, 24
-; GFX11-NEXT:    v_lshrrev_b16 v2, 8, s2
-; GFX11-NEXT:    v_lshrrev_b16 v3, 8, s3
-; GFX11-NEXT:    v_add_nc_u16 v7, s11, s11
-; GFX11-NEXT:    v_add_nc_u16 v8, s10, s10
-; GFX11-NEXT:    v_add_nc_u16 v4, s3, s3
-; GFX11-NEXT:    v_add_nc_u16 v5, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v3, v3, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, v2
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    s_lshr_b32 s7, s1, 24
-; GFX11-NEXT:    v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT:    v_add_nc_u16 v11, s7, s7
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v7
-; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s9, s2, 24
-; GFX11-NEXT:    v_add_nc_u16 v6, s1, s1
-; GFX11-NEXT:    v_add_nc_u16 v12, s6, s6
-; GFX11-NEXT:    v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT:    v_add_nc_u16 v9, s9, s9
-; GFX11-NEXT:    v_add_nc_u16 v10, s8, s8
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_lshlrev_b16 v4, 8, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GFX11-NEXT:    v_add_nc_u16 v7, s0, s0
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v8, s5, s5
-; GFX11-NEXT:    v_add_nc_u16 v11, s4, s4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v8
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_or_b32_e32 v1, v6, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v4
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v7, v0
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v4
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v7
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX11-NEXT:    s_bfe_u32 s12, s0, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s15, s3, 0x80008
+; GFX11-NEXT:    s_add_i32 s11, s11, s11
+; GFX11-NEXT:    s_add_i32 s10, s10, s10
+; GFX11-NEXT:    s_add_i32 s9, s9, s9
+; GFX11-NEXT:    s_add_i32 s8, s8, s8
+; GFX11-NEXT:    s_add_i32 s7, s7, s7
+; GFX11-NEXT:    s_add_i32 s6, s6, s6
+; GFX11-NEXT:    s_add_i32 s3, s3, s3
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_add_i32 s15, s15, s15
+; GFX11-NEXT:    s_add_i32 s14, s14, s14
+; GFX11-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX11-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_add_i32 s13, s13, s13
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s12, s12, s12
+; GFX11-NEXT:    s_add_i32 s5, s5, s5
+; GFX11-NEXT:    s_add_i32 s4, s4, s4
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX11-NEXT:    s_or_b32 s10, s10, s11
+; GFX11-NEXT:    s_lshl_b32 s11, s14, 8
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s13, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s7, s12, 8
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_or_b32 s3, s3, s15
+; GFX11-NEXT:    s_or_b32 s2, s2, s11
+; GFX11-NEXT:    s_or_b32 s1, s1, s9
+; GFX11-NEXT:    s_or_b32 s0, s0, s7
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX11-NEXT:    s_lshl_b32 s5, s6, 16
+; GFX11-NEXT:    s_or_b32 s3, s3, s10
+; GFX11-NEXT:    s_or_b32 s2, s2, s8
+; GFX11-NEXT:    s_or_b32 s0, s0, s4
+; GFX11-NEXT:    s_or_b32 s1, s1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1875,258 +1900,292 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; VI-LABEL: amd_kernel_v32i8:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v10, 0
-; VI-NEXT:    v_mov_b32_e32 v11, 0
+; VI-NEXT:    v_mov_b32_e32 v4, 16
+; VI-NEXT:    v_mov_b32_e32 v5, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s8, s3, 24
-; VI-NEXT:    s_lshr_b32 s9, s3, 16
-; VI-NEXT:    s_add_i32 s9, s9, s9
-; VI-NEXT:    s_add_i32 s8, s8, s8
-; VI-NEXT:    v_mov_b32_e32 v3, s3
-; VI-NEXT:    s_add_i32 s3, s3, s3
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s8
-; VI-NEXT:    v_mov_b32_e32 v9, s9
-; VI-NEXT:    s_lshr_b32 s10, s2, 24
-; VI-NEXT:    s_lshr_b32 s11, s2, 16
-; VI-NEXT:    v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s3
-; VI-NEXT:    s_add_i32 s11, s11, s11
-; VI-NEXT:    s_add_i32 s10, s10, s10
-; VI-NEXT:    v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    s_add_i32 s2, s2, s2
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s10
-; VI-NEXT:    v_mov_b32_e32 v9, s11
-; VI-NEXT:    s_lshr_b32 s12, s1, 24
-; VI-NEXT:    s_lshr_b32 s13, s1, 16
-; VI-NEXT:    v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    s_add_i32 s13, s13, s13
-; VI-NEXT:    s_add_i32 s12, s12, s12
-; VI-NEXT:    v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_sdwa v4, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_add_i32 s1, s1, s1
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s12
-; VI-NEXT:    v_mov_b32_e32 v9, s13
-; VI-NEXT:    s_lshr_b32 s14, s0, 24
-; VI-NEXT:    s_lshr_b32 s15, s0, 16
-; VI-NEXT:    v_add_u32_sdwa v5, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s6
-; VI-NEXT:    v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    v_add_u32_sdwa v6, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s7
+; VI-NEXT:    s_lshr_b32 s8, s4, 16
+; VI-NEXT:    s_lshr_b32 s9, s4, 24
+; VI-NEXT:    s_lshr_b32 s10, s5, 16
+; VI-NEXT:    s_lshr_b32 s11, s5, 24
+; VI-NEXT:    s_lshr_b32 s12, s6, 16
+; VI-NEXT:    s_lshr_b32 s13, s6, 24
+; VI-NEXT:    s_lshr_b32 s14, s7, 16
+; VI-NEXT:    s_lshr_b32 s15, s7, 24
+; VI-NEXT:    s_bfe_u32 s24, s4, 0x80008
+; VI-NEXT:    s_bfe_u32 s25, s5, 0x80008
+; VI-NEXT:    s_bfe_u32 s26, s6, 0x80008
+; VI-NEXT:    s_bfe_u32 s27, s7, 0x80008
 ; VI-NEXT:    s_add_i32 s15, s15, s15
 ; VI-NEXT:    s_add_i32 s14, s14, s14
-; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_sdwa v7, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_add_i32 s0, s0, s0
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s14
-; VI-NEXT:    v_mov_b32_e32 v9, s15
-; VI-NEXT:    s_lshr_b32 s16, s7, 24
-; VI-NEXT:    s_lshr_b32 s17, s7, 16
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s0
-; VI-NEXT:    s_add_i32 s17, s17, s17
-; VI-NEXT:    s_add_i32 s16, s16, s16
-; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s13, s13, s13
+; VI-NEXT:    s_add_i32 s12, s12, s12
+; VI-NEXT:    s_add_i32 s11, s11, s11
+; VI-NEXT:    s_add_i32 s10, s10, s10
+; VI-NEXT:    s_add_i32 s9, s9, s9
+; VI-NEXT:    s_add_i32 s8, s8, s8
+; VI-NEXT:    s_lshr_b32 s16, s0, 16
+; VI-NEXT:    s_lshr_b32 s17, s0, 24
+; VI-NEXT:    s_lshr_b32 s18, s1, 16
+; VI-NEXT:    s_lshr_b32 s19, s1, 24
+; VI-NEXT:    s_lshr_b32 s20, s2, 16
+; VI-NEXT:    s_lshr_b32 s21, s2, 24
+; VI-NEXT:    s_lshr_b32 s22, s3, 16
+; VI-NEXT:    s_lshr_b32 s23, s3, 24
+; VI-NEXT:    s_lshl_b32 s15, s15, 8
+; VI-NEXT:    s_and_b32 s14, s14, 0xff
 ; VI-NEXT:    s_add_i32 s7, s7, s7
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s16
-; VI-NEXT:    v_mov_b32_e32 v9, s17
-; VI-NEXT:    s_lshr_b32 s18, s6, 24
-; VI-NEXT:    s_lshr_b32 s19, s6, 16
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s7
-; VI-NEXT:    s_add_i32 s19, s19, s19
-; VI-NEXT:    s_add_i32 s18, s18, s18
-; VI-NEXT:    v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s27, s27, s27
+; VI-NEXT:    s_lshl_b32 s13, s13, 8
+; VI-NEXT:    s_and_b32 s12, s12, 0xff
 ; VI-NEXT:    s_add_i32 s6, s6, s6
-; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s18
-; VI-NEXT:    v_mov_b32_e32 v9, s19
-; VI-NEXT:    s_lshr_b32 s20, s5, 24
-; VI-NEXT:    s_lshr_b32 s21, s5, 16
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s6
-; VI-NEXT:    s_add_i32 s21, s21, s21
-; VI-NEXT:    s_add_i32 s20, s20, s20
-; VI-NEXT:    v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_add_i32 s26, s26, s26
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
 ; VI-NEXT:    s_add_i32 s5, s5, s5
-; VI-NEXT:    v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s20
-; VI-NEXT:    v_mov_b32_e32 v9, s21
-; VI-NEXT:    s_lshr_b32 s22, s4, 24
-; VI-NEXT:    s_lshr_b32 s23, s4, 16
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s5
+; VI-NEXT:    s_add_i32 s25, s25, s25
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
+; VI-NEXT:    s_add_i32 s4, s4, s4
+; VI-NEXT:    s_add_i32 s24, s24, s24
+; VI-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; VI-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; VI-NEXT:    s_bfe_u32 s30, s2, 0x80008
+; VI-NEXT:    s_bfe_u32 s31, s3, 0x80008
 ; VI-NEXT:    s_add_i32 s23, s23, s23
 ; VI-NEXT:    s_add_i32 s22, s22, s22
-; VI-NEXT:    v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_add_i32 s4, s4, s4
-; VI-NEXT:    v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v8, 8, s22
-; VI-NEXT:    v_mov_b32_e32 v9, s23
-; VI-NEXT:    v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v9, s4
-; VI-NEXT:    v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v8, 16
-; VI-NEXT:    v_mov_b32_e32 v9, 0
-; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; VI-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; VI-NEXT:    s_add_i32 s21, s21, s21
+; VI-NEXT:    s_add_i32 s20, s20, s20
+; VI-NEXT:    s_add_i32 s19, s19, s19
+; VI-NEXT:    s_add_i32 s18, s18, s18
+; VI-NEXT:    s_add_i32 s17, s17, s17
+; VI-NEXT:    s_add_i32 s16, s16, s16
+; VI-NEXT:    s_or_b32 s14, s14, s15
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_lshl_b32 s15, s27, 8
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_lshl_b32 s13, s26, 8
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_and_b32 s5, s5, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s25, 8
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s24, 8
+; VI-NEXT:    s_lshl_b32 s23, s23, 8
+; VI-NEXT:    s_and_b32 s22, s22, 0xff
+; VI-NEXT:    s_add_i32 s3, s3, s3
+; VI-NEXT:    s_add_i32 s31, s31, s31
+; VI-NEXT:    s_lshl_b32 s21, s21, 8
+; VI-NEXT:    s_and_b32 s20, s20, 0xff
+; VI-NEXT:    s_add_i32 s2, s2, s2
+; VI-NEXT:    s_add_i32 s30, s30, s30
+; VI-NEXT:    s_lshl_b32 s19, s19, 8
+; VI-NEXT:    s_and_b32 s18, s18, 0xff
+; VI-NEXT:    s_add_i32 s1, s1, s1
+; VI-NEXT:    s_add_i32 s29, s29, s29
+; VI-NEXT:    s_lshl_b32 s17, s17, 8
+; VI-NEXT:    s_and_b32 s16, s16, 0xff
+; VI-NEXT:    s_add_i32 s0, s0, s0
+; VI-NEXT:    s_add_i32 s28, s28, s28
+; VI-NEXT:    s_or_b32 s7, s7, s15
+; VI-NEXT:    s_or_b32 s6, s6, s13
+; VI-NEXT:    s_or_b32 s5, s5, s11
+; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    s_or_b32 s22, s22, s23
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_lshl_b32 s23, s31, 8
+; VI-NEXT:    s_or_b32 s20, s20, s21
+; VI-NEXT:    s_and_b32 s2, s2, 0xff
+; VI-NEXT:    s_lshl_b32 s21, s30, 8
+; VI-NEXT:    s_or_b32 s18, s18, s19
+; VI-NEXT:    s_and_b32 s1, s1, 0xff
+; VI-NEXT:    s_lshl_b32 s19, s29, 8
+; VI-NEXT:    s_or_b32 s16, s16, s17
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_lshl_b32 s17, s28, 8
+; VI-NEXT:    s_lshl_b32 s14, s14, 16
+; VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; VI-NEXT:    s_lshl_b32 s12, s12, 16
+; VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_lshl_b32 s8, s8, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s3, s3, s23
+; VI-NEXT:    s_or_b32 s2, s2, s21
+; VI-NEXT:    s_or_b32 s1, s1, s19
+; VI-NEXT:    s_or_b32 s0, s0, s17
+; VI-NEXT:    s_or_b32 s7, s7, s14
+; VI-NEXT:    s_or_b32 s6, s6, s12
+; VI-NEXT:    s_or_b32 s5, s5, s10
+; VI-NEXT:    s_or_b32 s4, s4, s8
+; VI-NEXT:    s_lshl_b32 s22, s22, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_lshl_b32 s20, s20, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshl_b32 s18, s18, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s16, s16, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s7
+; VI-NEXT:    s_or_b32 s3, s3, s22
+; VI-NEXT:    s_or_b32 s2, s2, s20
+; VI-NEXT:    s_or_b32 s1, s1, s18
+; VI-NEXT:    s_or_b32 s0, s0, s16
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v5, 0
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amd_kernel_v32i8:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_load_b256 s[0:7], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v3, 8, s2
-; GFX11-NEXT:    v_lshrrev_b16 v7, 8, s3
-; GFX11-NEXT:    s_lshr_b32 s21, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s22, s3, 24
-; GFX11-NEXT:    v_add_nc_u16 v8, s3, s3
-; GFX11-NEXT:    v_add_nc_u16 v9, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v7, v7, v7
-; GFX11-NEXT:    v_add_nc_u16 v10, s22, s22
-; GFX11-NEXT:    v_add_nc_u16 v11, s21, s21
-; GFX11-NEXT:    v_add_nc_u16 v3, v3, v3
-; GFX11-NEXT:    v_lshrrev_b16 v2, 8, s1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_lshr_b32 s18, s1, 16
-; GFX11-NEXT:    s_lshr_b32 s19, s1, 24
-; GFX11-NEXT:    s_lshr_b32 s20, s2, 24
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, s20, s20
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT:    v_add_nc_u16 v9, s2, s2
-; GFX11-NEXT:    v_add_nc_u16 v11, s1, s1
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, v2
-; GFX11-NEXT:    v_add_nc_u16 v12, s19, s19
-; GFX11-NEXT:    v_add_nc_u16 v13, s18, s18
-; GFX11-NEXT:    v_lshrrev_b16 v1, 8, s0
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v3
-; GFX11-NEXT:    v_or_b32_e32 v3, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v2, v11, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, s0, s0
-; GFX11-NEXT:    v_or_b32_e32 v8, v13, v12
-; GFX11-NEXT:    v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT:    v_lshrrev_b16 v6, 8, s7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v1
-; GFX11-NEXT:    v_lshrrev_b16 v5, 8, s6
-; GFX11-NEXT:    s_lshr_b32 s14, s7, 16
-; GFX11-NEXT:    s_lshr_b32 s15, s7, 24
 ; GFX11-NEXT:    s_lshr_b32 s16, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s17, s0, 24
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v14, v11
-; GFX11-NEXT:    v_add_nc_u16 v7, s7, s7
-; GFX11-NEXT:    v_or_b32_e32 v1, v12, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v13
-; GFX11-NEXT:    v_add_nc_u16 v9, s17, s17
-; GFX11-NEXT:    v_add_nc_u16 v10, s16, s16
-; GFX11-NEXT:    v_add_nc_u16 v6, v6, v6
-; GFX11-NEXT:    v_add_nc_u16 v11, s15, s15
-; GFX11-NEXT:    v_add_nc_u16 v12, s14, s14
-; GFX11-NEXT:    v_add_nc_u16 v13, s6, s6
-; GFX11-NEXT:    v_add_nc_u16 v5, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshrrev_b16 v0, 8, s4
-; GFX11-NEXT:    v_lshrrev_b16 v4, 8, s5
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-NEXT:    s_lshr_b32 s20, s2, 16
+; GFX11-NEXT:    s_lshr_b32 s21, s2, 24
+; GFX11-NEXT:    s_lshr_b32 s14, s7, 16
+; GFX11-NEXT:    s_lshr_b32 s15, s7, 24
+; GFX11-NEXT:    s_bfe_u32 s27, s7, 0x80008
+; GFX11-NEXT:    s_add_i32 s17, s17, s17
+; GFX11-NEXT:    s_add_i32 s16, s16, s16
+; GFX11-NEXT:    s_lshr_b32 s18, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s19, s1, 24
+; GFX11-NEXT:    s_lshr_b32 s22, s3, 16
+; GFX11-NEXT:    s_lshr_b32 s23, s3, 24
+; GFX11-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s30, s3, 0x80008
+; GFX11-NEXT:    s_add_i32 s21, s21, s21
+; GFX11-NEXT:    s_add_i32 s20, s20, s20
+; GFX11-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX11-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX11-NEXT:    s_add_i32 s7, s7, s7
+; GFX11-NEXT:    s_add_i32 s27, s27, s27
+; GFX11-NEXT:    s_add_i32 s15, s15, s15
+; GFX11-NEXT:    s_add_i32 s14, s14, s14
+; GFX11-NEXT:    s_add_i32 s3, s3, s3
+; GFX11-NEXT:    s_add_i32 s30, s30, s30
+; GFX11-NEXT:    s_add_i32 s23, s23, s23
+; GFX11-NEXT:    s_add_i32 s22, s22, s22
+; GFX11-NEXT:    s_lshl_b32 s21, s21, 8
+; GFX11-NEXT:    s_and_b32 s20, s20, 0xff
+; GFX11-NEXT:    s_add_i32 s1, s1, s1
+; GFX11-NEXT:    s_add_i32 s29, s29, s29
+; GFX11-NEXT:    s_add_i32 s19, s19, s19
+; GFX11-NEXT:    s_add_i32 s18, s18, s18
+; GFX11-NEXT:    s_lshr_b32 s10, s5, 16
+; GFX11-NEXT:    s_lshr_b32 s11, s5, 24
 ; GFX11-NEXT:    s_lshr_b32 s12, s6, 16
 ; GFX11-NEXT:    s_lshr_b32 s13, s6, 24
+; GFX11-NEXT:    s_or_b32 s16, s16, s17
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX11-NEXT:    s_lshl_b32 s17, s27, 8
+; GFX11-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-NEXT:    s_lshl_b32 s30, s30, 8
+; GFX11-NEXT:    s_lshl_b32 s23, s23, 8
+; GFX11-NEXT:    s_and_b32 s22, s22, 0xff
+; GFX11-NEXT:    s_or_b32 s20, s20, s21
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_lshl_b32 s21, s29, 8
+; GFX11-NEXT:    s_lshl_b32 s19, s19, 8
+; GFX11-NEXT:    s_and_b32 s18, s18, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s8, s4, 16
 ; GFX11-NEXT:    s_lshr_b32 s9, s4, 24
-; GFX11-NEXT:    s_lshr_b32 s10, s5, 16
-; GFX11-NEXT:    s_lshr_b32 s11, s5, 24
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v12, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v9
-; GFX11-NEXT:    v_add_nc_u16 v9, s13, s13
-; GFX11-NEXT:    v_add_nc_u16 v10, s12, s12
-; GFX11-NEXT:    v_or_b32_e32 v5, v13, v5
-; GFX11-NEXT:    v_add_nc_u16 v11, s5, s5
-; GFX11-NEXT:    v_add_nc_u16 v4, v4, v4
-; GFX11-NEXT:    v_add_nc_u16 v13, s11, s11
-; GFX11-NEXT:    v_add_nc_u16 v14, s10, s10
-; GFX11-NEXT:    v_add_nc_u16 v15, s4, s4
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    v_add_nc_u16 v16, s9, s9
-; GFX11-NEXT:    v_add_nc_u16 v17, s8, s8
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v4, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v9
-; GFX11-NEXT:    v_or_b32_e32 v4, v11, v4
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v13
-; GFX11-NEXT:    v_or_b32_e32 v0, v15, v0
-; GFX11-NEXT:    v_or_b32_e32 v11, v17, v16
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v6, v5, v9
-; GFX11-NEXT:    v_mov_b32_e32 v8, 16
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0
-; GFX11-NEXT:    v_or_b32_e32 v5, v4, v10
-; GFX11-NEXT:    v_or_b32_e32 v4, v0, v11
-; GFX11-NEXT:    v_mov_b32_e32 v10, 0
-; GFX11-NEXT:    v_mov_b32_e32 v11, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, v12, v13
+; GFX11-NEXT:    s_bfe_u32 s24, s4, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s25, s5, 0x80008
+; GFX11-NEXT:    s_bfe_u32 s26, s6, 0x80008
+; GFX11-NEXT:    s_or_b32 s7, s7, s17
+; GFX11-NEXT:    s_or_b32 s14, s14, s15
+; GFX11-NEXT:    s_add_i32 s13, s13, s13
+; GFX11-NEXT:    s_add_i32 s12, s12, s12
+; GFX11-NEXT:    s_add_i32 s11, s11, s11
+; GFX11-NEXT:    s_add_i32 s10, s10, s10
+; GFX11-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; GFX11-NEXT:    s_or_b32 s3, s3, s30
+; GFX11-NEXT:    s_or_b32 s22, s22, s23
+; GFX11-NEXT:    s_bfe_u32 s23, s2, 0x80008
+; GFX11-NEXT:    s_or_b32 s1, s1, s21
+; GFX11-NEXT:    s_or_b32 s18, s18, s19
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_add_i32 s6, s6, s6
+; GFX11-NEXT:    s_add_i32 s26, s26, s26
+; GFX11-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX11-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX11-NEXT:    s_add_i32 s5, s5, s5
+; GFX11-NEXT:    s_add_i32 s25, s25, s25
+; GFX11-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-NEXT:    s_add_i32 s4, s4, s4
+; GFX11-NEXT:    s_add_i32 s24, s24, s24
+; GFX11-NEXT:    s_add_i32 s9, s9, s9
+; GFX11-NEXT:    s_add_i32 s8, s8, s8
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s22, s22, 16
+; GFX11-NEXT:    s_add_i32 s2, s2, s2
+; GFX11-NEXT:    s_add_i32 s23, s23, s23
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s18, s18, 16
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    s_add_i32 s28, s28, s28
+; GFX11-NEXT:    s_or_b32 s7, s7, s14
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_lshl_b32 s14, s26, 8
+; GFX11-NEXT:    s_or_b32 s12, s12, s13
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX11-NEXT:    s_lshl_b32 s13, s25, 8
+; GFX11-NEXT:    s_or_b32 s10, s10, s11
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_lshl_b32 s11, s24, 8
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX11-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX11-NEXT:    s_or_b32 s3, s3, s22
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_lshl_b32 s22, s23, 8
+; GFX11-NEXT:    s_or_b32 s1, s1, s18
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_lshl_b32 s18, s28, 8
+; GFX11-NEXT:    s_or_b32 s6, s6, s14
+; GFX11-NEXT:    s_or_b32 s5, s5, s13
+; GFX11-NEXT:    s_or_b32 s4, s4, s11
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_or_b32 s2, s2, s22
+; GFX11-NEXT:    s_or_b32 s0, s0, s18
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s12, s12, 16
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX11-NEXT:    s_lshl_b32 s9, s10, 16
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s20, s20, 16
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s16, s16, 16
+; GFX11-NEXT:    s_or_b32 s6, s6, s12
+; GFX11-NEXT:    s_or_b32 s4, s4, s8
+; GFX11-NEXT:    s_or_b32 s5, s5, s9
+; GFX11-NEXT:    s_or_b32 s2, s2, s20
+; GFX11-NEXT:    s_or_b32 s0, s0, s16
+; GFX11-NEXT:    v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v5, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v11, 0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v[8:9], v[4:7], off
-; GFX11-NEXT:    global_store_b128 v[10:11], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -2777,57 +2836,56 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v8i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_and_b32_e64 v1, s6, 1
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s7
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s5
-; VI-NEXT:    v_and_b32_e64 v2, s4, 1
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_and_b32_e32 v1, 3, v1
-; VI-NEXT:    v_and_b32_e64 v2, s2, 1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s3
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s1
-; VI-NEXT:    v_and_b32_e64 v3, s0, 1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_and_b32_e32 v2, 3, v2
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
-; VI-NEXT:    v_and_b32_e32 v1, 15, v1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_lshl_b32 s5, s5, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_lshl_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_lshl_b32 s7, s7, 3
+; VI-NEXT:    s_lshl_b32 s6, s6, 2
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s2, s2, 2
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_and_b32 s4, s4, 3
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_and_b32 s0, s0, 3
+; VI-NEXT:    s_or_b32 s4, s4, s6
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s4, s4, 4
+; VI-NEXT:    s_and_b32 s0, s0, 15
+; VI-NEXT:    s_or_b32 s0, s0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v8i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v1, s6, 1
-; GFX11-NEXT:    v_lshlrev_b16 v2, 1, s5
-; GFX11-NEXT:    v_and_b32_e64 v3, s4, 1
-; GFX11-NEXT:    v_and_b32_e64 v4, s2, 1
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, s1
-; GFX11-NEXT:    v_and_b32_e64 v6, s0, 1
-; GFX11-NEXT:    v_lshlrev_b16 v0, 3, s7
-; GFX11-NEXT:    v_lshlrev_b16 v1, 2, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s3
-; GFX11-NEXT:    v_lshlrev_b16 v4, 2, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v0, 4, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_and_b32 s6, s6, 1
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_and_b32 s4, s4, 1
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    s_and_b32 s4, s4, 3
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 3
+; GFX11-NEXT:    s_or_b32 s2, s4, s5
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2892,105 +2950,104 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v16i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_and_b32_e64 v1, s14, 1
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s15
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s13
-; VI-NEXT:    v_and_b32_e64 v2, s12, 1
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_and_b32_e32 v1, 3, v1
-; VI-NEXT:    v_and_b32_e64 v2, s10, 1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s11
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s9
-; VI-NEXT:    v_and_b32_e64 v3, s8, 1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_and_b32_e32 v2, 3, v2
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_mov_b32_e32 v2, 15
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_and_b32_e64 v2, s6, 1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s7
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s5
-; VI-NEXT:    v_and_b32_e64 v3, s4, 1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_and_b32_e32 v2, 3, v2
-; VI-NEXT:    v_and_b32_e64 v3, s2, 1
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 3, s3
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s1
-; VI-NEXT:    v_and_b32_e64 v4, s0, 1
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_and_b32_e32 v3, 3, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
-; VI-NEXT:    v_and_b32_e32 v2, 15, v2
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_and_b32 s10, s10, 1
+; VI-NEXT:    s_lshl_b32 s9, s9, 1
+; VI-NEXT:    s_and_b32 s8, s8, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_lshl_b32 s5, s5, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_lshl_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s14, s14, 1
+; VI-NEXT:    s_lshl_b32 s13, s13, 1
+; VI-NEXT:    s_and_b32 s12, s12, 1
+; VI-NEXT:    s_lshl_b32 s11, s11, 3
+; VI-NEXT:    s_lshl_b32 s10, s10, 2
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_lshl_b32 s7, s7, 3
+; VI-NEXT:    s_lshl_b32 s6, s6, 2
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s2, s2, 2
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s15, s15, 3
+; VI-NEXT:    s_lshl_b32 s14, s14, 2
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_or_b32 s10, s11, s10
+; VI-NEXT:    s_and_b32 s8, s8, 3
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_and_b32 s4, s4, 3
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_and_b32 s0, s0, 3
+; VI-NEXT:    s_or_b32 s14, s15, s14
+; VI-NEXT:    s_and_b32 s12, s12, 3
+; VI-NEXT:    s_or_b32 s8, s8, s10
+; VI-NEXT:    s_or_b32 s4, s4, s6
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_or_b32 s12, s12, s14
+; VI-NEXT:    s_and_b32 s8, s8, 15
+; VI-NEXT:    s_lshl_b32 s4, s4, 4
+; VI-NEXT:    s_and_b32 s0, s0, 15
+; VI-NEXT:    s_lshl_b32 s12, s12, 12
+; VI-NEXT:    s_lshl_b32 s8, s8, 8
+; VI-NEXT:    s_or_b32 s0, s0, s4
+; VI-NEXT:    s_or_b32 s8, s12, s8
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_or_b32 s0, s0, s8
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v16i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, s10, 1
-; GFX11-NEXT:    v_lshlrev_b16 v2, 1, s13
-; GFX11-NEXT:    v_and_b32_e64 v3, s12, 1
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, s9
-; GFX11-NEXT:    v_and_b32_e64 v6, s8, 1
-; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s11
-; GFX11-NEXT:    v_lshlrev_b16 v0, 2, v0
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_and_b32_e64 v8, s4, 1
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v5
-; GFX11-NEXT:    v_and_b32_e64 v5, s6, 1
-; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s5
-; GFX11-NEXT:    v_and_b32_e64 v9, s2, 1
-; GFX11-NEXT:    v_lshlrev_b16 v10, 1, s1
-; GFX11-NEXT:    v_and_b32_e64 v11, s0, 1
-; GFX11-NEXT:    v_and_b32_e64 v1, s14, 1
-; GFX11-NEXT:    v_or_b32_e32 v0, v4, v0
-; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s3
-; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
-; GFX11-NEXT:    v_lshlrev_b16 v7, 3, s15
-; GFX11-NEXT:    v_lshlrev_b16 v1, 2, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 3, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 3, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v2, 4, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 15, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT:    s_and_b32 s10, s10, 1
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 1
+; GFX11-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-NEXT:    s_and_b32 s6, s6, 1
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_and_b32 s4, s4, 1
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_and_b32 s14, s14, 1
+; GFX11-NEXT:    s_lshl_b32 s13, s13, 1
+; GFX11-NEXT:    s_and_b32 s12, s12, 1
+; GFX11-NEXT:    s_lshl_b32 s11, s11, 3
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 2
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s15, s15, 3
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 2
+; GFX11-NEXT:    s_or_b32 s12, s12, s13
+; GFX11-NEXT:    s_or_b32 s9, s11, s10
+; GFX11-NEXT:    s_and_b32 s8, s8, 3
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    s_and_b32 s4, s4, 3
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 3
+; GFX11-NEXT:    s_or_b32 s13, s15, s14
+; GFX11-NEXT:    s_and_b32 s12, s12, 3
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_or_b32 s2, s4, s5
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s10, s12, s13
+; GFX11-NEXT:    s_and_b32 s8, s8, 15
+; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_lshl_b32 s9, s10, 12
+; GFX11-NEXT:    s_lshl_b32 s2, s8, 8
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s1, s9, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3103,196 +3160,200 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ;
 ; VI-LABEL: amdgpu_cs_inreg_v32i1:
 ; VI:       ; %bb.0:
-; VI-NEXT:    v_and_b32_e64 v1, s14, 1
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 3, s15
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 2, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 1, s13
-; VI-NEXT:    v_and_b32_e64 v2, s12, 1
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_and_b32_e32 v1, 3, v1
-; VI-NEXT:    v_and_b32_e64 v2, s10, 1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s11
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 2, v2
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 1, s9
-; VI-NEXT:    v_and_b32_e64 v3, s8, 1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_and_b32_e32 v2, 3, v2
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
-; VI-NEXT:    v_mov_b32_e32 v2, 15
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 12, v0
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_and_b32_e64 v3, s6, 1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s7
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
-; VI-NEXT:    v_or_b32_e32 v1, v1, v3
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s5
-; VI-NEXT:    v_and_b32_e64 v4, s4, 1
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_and_b32_e32 v3, 3, v3
-; VI-NEXT:    v_and_b32_e64 v4, s2, 1
-; VI-NEXT:    v_or_b32_e32 v1, v3, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s3
-; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
-; VI-NEXT:    v_or_b32_e32 v3, v3, v4
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s1
-; VI-NEXT:    v_and_b32_e64 v5, s0, 1
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 4, v1
-; VI-NEXT:    v_and_b32_e32 v3, 15, v3
-; VI-NEXT:    v_or_b32_e32 v1, v3, v1
-; VI-NEXT:    v_and_b32_e64 v3, s30, 1
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 3, s31
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
-; VI-NEXT:    v_or_b32_e32 v1, v1, v3
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s29
-; VI-NEXT:    v_and_b32_e64 v4, s28, 1
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_and_b32_e32 v3, 3, v3
-; VI-NEXT:    v_and_b32_e64 v4, s26, 1
-; VI-NEXT:    v_or_b32_e32 v1, v3, v1
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s27
-; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
-; VI-NEXT:    v_or_b32_e32 v3, v3, v4
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s25
-; VI-NEXT:    v_and_b32_e64 v5, s24, 1
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 12, v1
-; VI-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_and_b32_e64 v3, s22, 1
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 3, s23
-; VI-NEXT:    v_lshlrev_b16_e32 v3, 2, v3
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 1, s21
-; VI-NEXT:    v_and_b32_e64 v4, s20, 1
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_and_b32_e32 v3, 3, v3
-; VI-NEXT:    v_and_b32_e64 v4, s18, 1
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_lshlrev_b16_e64 v3, 3, s19
-; VI-NEXT:    v_lshlrev_b16_e32 v4, 2, v4
-; VI-NEXT:    v_or_b32_e32 v3, v3, v4
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 1, s17
-; VI-NEXT:    v_and_b32_e64 v5, s16, 1
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    v_and_b32_e32 v4, 3, v4
-; VI-NEXT:    v_or_b32_e32 v3, v4, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 4, v2
-; VI-NEXT:    v_and_b32_e32 v3, 15, v3
-; VI-NEXT:    v_or_b32_e32 v2, v3, v2
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_and_b32 s26, s26, 1
+; VI-NEXT:    s_lshl_b32 s25, s25, 1
+; VI-NEXT:    s_and_b32 s24, s24, 1
+; VI-NEXT:    s_and_b32 s22, s22, 1
+; VI-NEXT:    s_lshl_b32 s21, s21, 1
+; VI-NEXT:    s_and_b32 s20, s20, 1
+; VI-NEXT:    s_and_b32 s18, s18, 1
+; VI-NEXT:    s_lshl_b32 s17, s17, 1
+; VI-NEXT:    s_and_b32 s16, s16, 1
+; VI-NEXT:    s_and_b32 s10, s10, 1
+; VI-NEXT:    s_lshl_b32 s9, s9, 1
+; VI-NEXT:    s_and_b32 s8, s8, 1
+; VI-NEXT:    s_and_b32 s6, s6, 1
+; VI-NEXT:    s_lshl_b32 s5, s5, 1
+; VI-NEXT:    s_and_b32 s4, s4, 1
+; VI-NEXT:    s_and_b32 s2, s2, 1
+; VI-NEXT:    s_lshl_b32 s1, s1, 1
+; VI-NEXT:    s_and_b32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s30, s30, 1
+; VI-NEXT:    s_lshl_b32 s29, s29, 1
+; VI-NEXT:    s_and_b32 s28, s28, 1
+; VI-NEXT:    s_lshl_b32 s27, s27, 3
+; VI-NEXT:    s_lshl_b32 s26, s26, 2
+; VI-NEXT:    s_or_b32 s24, s24, s25
+; VI-NEXT:    s_lshl_b32 s23, s23, 3
+; VI-NEXT:    s_lshl_b32 s22, s22, 2
+; VI-NEXT:    s_or_b32 s20, s20, s21
+; VI-NEXT:    s_lshl_b32 s19, s19, 3
+; VI-NEXT:    s_lshl_b32 s18, s18, 2
+; VI-NEXT:    s_or_b32 s16, s16, s17
+; VI-NEXT:    s_and_b32 s14, s14, 1
+; VI-NEXT:    s_lshl_b32 s13, s13, 1
+; VI-NEXT:    s_and_b32 s12, s12, 1
+; VI-NEXT:    s_lshl_b32 s11, s11, 3
+; VI-NEXT:    s_lshl_b32 s10, s10, 2
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_lshl_b32 s7, s7, 3
+; VI-NEXT:    s_lshl_b32 s6, s6, 2
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s3, s3, 3
+; VI-NEXT:    s_lshl_b32 s2, s2, 2
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s31, s31, 3
+; VI-NEXT:    s_lshl_b32 s30, s30, 2
+; VI-NEXT:    s_or_b32 s28, s28, s29
+; VI-NEXT:    s_or_b32 s26, s27, s26
+; VI-NEXT:    s_and_b32 s24, s24, 3
+; VI-NEXT:    s_or_b32 s22, s23, s22
+; VI-NEXT:    s_and_b32 s20, s20, 3
+; VI-NEXT:    s_or_b32 s18, s19, s18
+; VI-NEXT:    s_and_b32 s16, s16, 3
+; VI-NEXT:    s_lshl_b32 s15, s15, 3
+; VI-NEXT:    s_lshl_b32 s14, s14, 2
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_or_b32 s10, s11, s10
+; VI-NEXT:    s_and_b32 s8, s8, 3
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_and_b32 s4, s4, 3
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_and_b32 s0, s0, 3
+; VI-NEXT:    s_or_b32 s30, s31, s30
+; VI-NEXT:    s_and_b32 s28, s28, 3
+; VI-NEXT:    s_or_b32 s24, s24, s26
+; VI-NEXT:    s_or_b32 s20, s20, s22
+; VI-NEXT:    s_or_b32 s16, s16, s18
+; VI-NEXT:    s_or_b32 s14, s15, s14
+; VI-NEXT:    s_and_b32 s12, s12, 3
+; VI-NEXT:    s_or_b32 s8, s8, s10
+; VI-NEXT:    s_or_b32 s4, s4, s6
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_or_b32 s28, s28, s30
+; VI-NEXT:    s_and_b32 s24, s24, 15
+; VI-NEXT:    s_lshl_b32 s20, s20, 4
+; VI-NEXT:    s_and_b32 s16, s16, 15
+; VI-NEXT:    s_or_b32 s12, s12, s14
+; VI-NEXT:    s_and_b32 s8, s8, 15
+; VI-NEXT:    s_lshl_b32 s4, s4, 4
+; VI-NEXT:    s_and_b32 s0, s0, 15
+; VI-NEXT:    s_lshl_b32 s28, s28, 12
+; VI-NEXT:    s_lshl_b32 s24, s24, 8
+; VI-NEXT:    s_or_b32 s16, s16, s20
+; VI-NEXT:    s_lshl_b32 s12, s12, 12
+; VI-NEXT:    s_lshl_b32 s8, s8, 8
+; VI-NEXT:    s_or_b32 s0, s0, s4
+; VI-NEXT:    s_or_b32 s24, s28, s24
+; VI-NEXT:    s_and_b32 s16, s16, 0xff
+; VI-NEXT:    s_or_b32 s8, s12, s8
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_or_b32 s16, s16, s24
+; VI-NEXT:    s_or_b32 s0, s0, s8
+; VI-NEXT:    s_lshl_b32 s16, s16, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: amdgpu_cs_inreg_v32i1:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e64 v0, s14, 1
-; GFX11-NEXT:    v_lshlrev_b16 v1, 1, s13
-; GFX11-NEXT:    v_and_b32_e64 v2, s12, 1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s15
-; GFX11-NEXT:    v_lshlrev_b16 v4, 1, s9
-; GFX11-NEXT:    v_lshlrev_b16 v0, 2, v0
-; GFX11-NEXT:    v_and_b32_e64 v5, s8, 1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    v_and_b32_e64 v2, s10, 1
-; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s5
-; GFX11-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, s11
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
-; GFX11-NEXT:    v_and_b32_e64 v5, s6, 1
-; GFX11-NEXT:    v_and_b32_e64 v7, s4, 1
-; GFX11-NEXT:    v_lshlrev_b16 v8, 1, s1
-; GFX11-NEXT:    v_and_b32_e64 v9, s0, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 3, v4
-; GFX11-NEXT:    v_lshlrev_b16 v4, 3, s7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT:    v_and_b32_e64 v7, s2, 1
-; GFX11-NEXT:    v_and_b32_e32 v1, 3, v1
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 3, v6
-; GFX11-NEXT:    v_lshlrev_b16 v6, 3, s3
-; GFX11-NEXT:    v_lshlrev_b16 v7, 2, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 3, v8
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_lshlrev_b16 v6, 1, s29
-; GFX11-NEXT:    v_and_b32_e64 v7, s28, 1
-; GFX11-NEXT:    v_lshlrev_b16 v9, 1, s25
-; GFX11-NEXT:    v_and_b32_e64 v10, s24, 1
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX11-NEXT:    v_and_b32_e64 v4, s26, 1
-; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s27
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v9
-; GFX11-NEXT:    v_and_b32_e64 v9, s22, 1
-; GFX11-NEXT:    v_lshlrev_b16 v4, 2, v4
-; GFX11-NEXT:    v_lshlrev_b16 v10, 1, s21
-; GFX11-NEXT:    v_and_b32_e64 v12, s20, 1
-; GFX11-NEXT:    v_and_b32_e64 v13, s18, 1
-; GFX11-NEXT:    v_lshlrev_b16 v14, 1, s17
-; GFX11-NEXT:    v_and_b32_e64 v15, s16, 1
-; GFX11-NEXT:    v_and_b32_e64 v5, s30, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v8, 3, s23
-; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v10
-; GFX11-NEXT:    v_lshlrev_b16 v12, 3, s19
-; GFX11-NEXT:    v_lshlrev_b16 v13, 2, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v15, v14
-; GFX11-NEXT:    v_lshlrev_b16 v11, 3, s31
-; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 3, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 3, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v11, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 3, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v12, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX11-NEXT:    v_lshlrev_b16 v2, 4, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 15, v3
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX11-NEXT:    v_lshlrev_b16 v6, 4, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 15, v8
-; GFX11-NEXT:    v_lshlrev_b16 v0, 12, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_lshlrev_b16 v3, 12, v5
-; GFX11-NEXT:    v_lshlrev_b16 v4, 8, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    s_and_b32 s10, s10, 1
+; GFX11-NEXT:    s_lshl_b32 s9, s9, 1
+; GFX11-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-NEXT:    s_and_b32 s14, s14, 1
+; GFX11-NEXT:    s_lshl_b32 s13, s13, 1
+; GFX11-NEXT:    s_and_b32 s12, s12, 1
+; GFX11-NEXT:    s_lshl_b32 s11, s11, 3
+; GFX11-NEXT:    s_lshl_b32 s10, s10, 2
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_and_b32 s6, s6, 1
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_and_b32 s4, s4, 1
+; GFX11-NEXT:    s_and_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s0, s0, 1
+; GFX11-NEXT:    s_lshl_b32 s15, s15, 3
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 2
+; GFX11-NEXT:    s_or_b32 s12, s12, s13
+; GFX11-NEXT:    s_or_b32 s9, s11, s10
+; GFX11-NEXT:    s_and_b32 s8, s8, 3
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s13, s15, s14
+; GFX11-NEXT:    s_and_b32 s12, s12, 3
+; GFX11-NEXT:    s_or_b32 s8, s8, s9
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    s_and_b32 s4, s4, 3
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 3
+; GFX11-NEXT:    s_or_b32 s10, s12, s13
+; GFX11-NEXT:    s_and_b32 s8, s8, 15
+; GFX11-NEXT:    s_or_b32 s2, s4, s5
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_lshl_b32 s9, s10, 12
+; GFX11-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX11-NEXT:    s_and_b32 s0, s0, 15
+; GFX11-NEXT:    s_lshl_b32 s2, s8, 8
+; GFX11-NEXT:    s_and_b32 s3, s30, 1
+; GFX11-NEXT:    s_lshl_b32 s4, s29, 1
+; GFX11-NEXT:    s_and_b32 s5, s28, 1
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s1, s9, s2
+; GFX11-NEXT:    s_lshl_b32 s2, s31, 3
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 2
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s5, s26, 1
+; GFX11-NEXT:    s_lshl_b32 s6, s25, 1
+; GFX11-NEXT:    s_and_b32 s7, s24, 1
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 3
+; GFX11-NEXT:    s_lshl_b32 s4, s27, 3
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_and_b32 s5, s6, 3
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_or_b32 s3, s5, s4
+; GFX11-NEXT:    s_and_b32 s5, s22, 1
+; GFX11-NEXT:    s_lshl_b32 s6, s21, 1
+; GFX11-NEXT:    s_and_b32 s7, s20, 1
+; GFX11-NEXT:    s_lshl_b32 s4, s23, 3
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX11-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-NEXT:    s_and_b32 s7, s18, 1
+; GFX11-NEXT:    s_lshl_b32 s8, s17, 1
+; GFX11-NEXT:    s_and_b32 s9, s16, 1
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_and_b32 s5, s6, 3
+; GFX11-NEXT:    s_lshl_b32 s6, s19, 3
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 2
+; GFX11-NEXT:    s_or_b32 s8, s9, s8
+; GFX11-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-NEXT:    s_and_b32 s7, s8, 3
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_or_b32 s5, s7, s6
+; GFX11-NEXT:    s_and_b32 s3, s3, 15
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX11-NEXT:    s_and_b32 s5, s5, 15
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 12
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_or_b32 s4, s5, s4
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s4, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 237e06def15763..d1f3f7e819221e 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN  %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN  %s
 
 ; This particular case will actually be worse in terms of code size
 ; from sinking into both.
@@ -120,15 +120,12 @@ ret:
 ; GCN-LABEL: {{^}}sink_ubfe_i16:
 ; GCN-NOT: lshr
 ; VI: s_load_dword [[ARG:s[0-9]+]], s[2:3], 0x2c
-; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
 ; GCN: s_cbranch_scc{{[0-1]}}
 
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0x7f
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
 
 ; GCN: .LBB2_3:
-; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: v_mov_b32_e32 v{{[0-9]+}}, 0xff
+; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
 
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 9e5dbe91504a0c..456d0ffd48e7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -539,7 +539,7 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
-; VI-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
 ; VI-NEXT:    s_endpgm
   %val = load <3 x i8>, ptr addrspace(1) %in, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index a0b549711f339b..2a8e852ba84d08 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1650,15 +1650,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ffbh_u32_e32 v2, v0
-; VI-NEXT:    v_min_u32_e32 v2, 32, v2
-; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbh_u32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1696,11 +1691,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1711,11 +1702,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_subrev_nc_u32_e32 v2, 16, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1727,13 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 2168e7fe1dd285..1d3b308f346fcf 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -608,8 +608,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; VI-NEXT:    v_ffbh_u32_e32 v1, v1
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -656,8 +657,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -708,15 +708,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v2
+; VI-NEXT:    v_readfirstlane_b32 s2, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; VI-NEXT:    v_ffbh_u32_e32 v1, v1
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT:    v_readfirstlane_b32 s3, v0
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s2, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_flbit_i32_b32 s3, s3
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_cselect_b32 s2, s3, 32
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -762,8 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 14e6c4bcf6d8fe..2ae4bf0a6ceecd 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1402,14 +1402,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    s_mov_b32 s8, s2
 ; VI-NEXT:    s_mov_b32 s9, s3
 ; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -1447,10 +1443,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1462,9 +1455,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 81ed823bad2044..966619a090d288 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -584,7 +584,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -628,8 +629,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -679,11 +679,12 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_ffbl_b32_e32 v1, v0
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -729,8 +730,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -1458,12 +1458,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0xff
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x100, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -1500,14 +1496,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[6:7]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -1554,15 +1549,11 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT:    v_ffbl_b32_e32 v2, v2
-; VI-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_ffbl_b32_e32 v2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -1607,8 +1598,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index 4b1eb7cb08e306..f4d8ec180cf916 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -126,7 +126,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspac
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 9,
+; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i16 -4, i16 3
   %bo = sub i16 5, %sel
@@ -135,8 +135,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addr
 }
 
 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg:
-; GCN: v_mov_b32_e32 [[F:v[0-9]+]], 0xfffff449
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, [[F]], -3,
+; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) {
   %sel = select i1 %cond, i16 4, i16 3000
   %bo = sub i16 1, %sel
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 0a2cac5a3e26ba..6c2e9c16bbfedb 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -232,28 +232,22 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, 2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 2, s[2:3]
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 3
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 4
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 4, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 5
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 6
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 6
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 6, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 7
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 7
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 7, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 8, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s2, s2, 8
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -668,52 +662,38 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s4, 1
-; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, 2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, 2, s[2:3]
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 3
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 3, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 4
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 4, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 5
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 6
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 6
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 6, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 7
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 7
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 7, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 8
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 8, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 9
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 9
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 9, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 10
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 10
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 10, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 11
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 11
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 11, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 12
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 12
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 12, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 13
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 13
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 13, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 14
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 14
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 14, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 15
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 15
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 15, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 16, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s2, s2, 16
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -751,388 +731,264 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 1
 ; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s4, 2
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_lg_u32 s4, 2
+; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 4
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 6
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 7
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 8
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 9
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 10
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 11
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 12
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 13
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 14
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 15
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 17
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 18
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 19
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 20
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 21
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 22
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 23
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 24
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 25
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 26
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 27
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 28
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 29
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 30
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 31
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 32
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 33
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 34
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 35
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 36
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 37
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 38
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 39
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 40
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 41
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 42
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 43
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 44
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 45
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 46
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 47
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 48
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 49
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 50
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 51
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 52
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 53
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 54
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 55
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 56
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 57
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 58
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 59
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 60
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 61
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 62
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 63
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s4, 64
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x41
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x42
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x43
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x44
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x45
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x46
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x47
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x48
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x49
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4a
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4b
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4c
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4d
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4e
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x4f
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x50
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x51
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x52
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x53
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x54
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x55
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x56
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x57
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x58
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x59
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5a
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5b
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5c
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5d
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5e
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x5f
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x60
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x61
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x62
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x63
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x64
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x65
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x66
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x67
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x68
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x69
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6a
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6b
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6c
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6d
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6e
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x6f
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x70
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x71
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x72
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x73
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x74
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x75
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x76
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x77
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x78
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x79
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7a
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7b
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7c
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7d
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7e
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
 ; GCN-NEXT:    s_cmpk_lg_i32 s4, 0x7f
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v0
+; GCN-NEXT:    s_cselect_b32 s2, s2, 0
+; GCN-NEXT:    s_and_b32 s2, s2, 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index b3a3c775e76f43..b602adedcbb605 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -53,7 +53,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; VI-NEXT:    s_lshr_b32 s5, s4, 8
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -341,8 +342,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_lshrrev_b16_e32 v0, s4, v0
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_lshr_b32 s4, s5, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fe672f1b3b1313..e6f9889440f0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -369,11 +369,11 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, s4, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x8000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -381,10 +381,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -393,9 +393,10 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v1, 0xffff8000, s4
+; GFX11-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -426,12 +427,12 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
-; VI-NEXT:    v_or_b32_e32 v2, 0x3c00, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x8000
+; VI-NEXT:    s_or_b32 s2, s2, 0x3c00
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -439,11 +440,11 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x3c00, v1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX9-NEXT:    s_or_b32 s2, s2, 0x3c00
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -452,12 +453,12 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -487,12 +488,12 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
-; VI-NEXT:    v_or_b32_e32 v2, 0x4900, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x8000
+; VI-NEXT:    s_or_b32 s2, s2, 0x4900
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -500,11 +501,11 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x4900, v1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX9-NEXT:    s_or_b32 s2, s2, 0x4900
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -513,12 +514,12 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -547,12 +548,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
-; VI-NEXT:    v_or_b32_e32 v2, 0x3c00, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x8000
+; VI-NEXT:    s_or_b32 s2, s2, 0x3c00
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -560,11 +561,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x3c00, v1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX9-NEXT:    s_or_b32 s2, s2, 0x3c00
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -573,12 +574,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s2, 0x3c00
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -608,12 +609,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0xffff8000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v0, s4, v0
-; VI-NEXT:    v_or_b32_e32 v2, 0x4900, v0
+; VI-NEXT:    s_and_b32 s2, s4, 0x8000
+; VI-NEXT:    s_or_b32 s2, s2, 0x4900
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
@@ -621,11 +622,11 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x4900, v1
+; GFX9-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX9-NEXT:    s_or_b32 s2, s2, 0x4900
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -634,12 +635,12 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e64 v0, 0xffff8000, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT:    s_and_b32 s2, s4, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s2, 0x4900
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index e447429539e6ff..9c3f5f1cd672d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -635,12 +635,7 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
 ; VI-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_xor_b32 s2, s4, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -721,11 +716,9 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s2, s4, 16
-; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
-; VI-NEXT:    s_xor_b32 s3, s4, 0x8000
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_add_f16_e64 v1, s3, 2.0
-; VI-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_e64 v1, 2.0, s4
+; VI-NEXT:    v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
index 932b10f14780b1..4f3086a9eb1f9a 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -6,34 +6,6 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
 
 define double @v_sqrt_f64(double %x) {
-; SDAG-LABEL: v_sqrt_f64:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -66,34 +38,6 @@ define double @v_sqrt_f64(double %x) {
 }
 
 define double @v_sqrt_f64_fneg(double %x) {
-; SDAG-LABEL: v_sqrt_f64_fneg:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 9
-; SDAG-NEXT:    v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_fneg:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -127,34 +71,6 @@ define double @v_sqrt_f64_fneg(double %x) {
 }
 
 define double @v_sqrt_f64_fabs(double %x) {
-; SDAG-LABEL: v_sqrt_f64_fabs:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -188,34 +104,6 @@ define double @v_sqrt_f64_fabs(double %x) {
 }
 
 define double @v_sqrt_f64_fneg_fabs(double %x) {
-; SDAG-LABEL: v_sqrt_f64_fneg_fabs:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 9
-; SDAG-NEXT:    v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], -|v[0:1]|, v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_fneg_fabs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -250,34 +138,6 @@ define double @v_sqrt_f64_fneg_fabs(double %x) {
 }
 
 define double @v_sqrt_f64_ninf(double %x) {
-; SDAG-LABEL: v_sqrt_f64_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -310,34 +170,6 @@ define double @v_sqrt_f64_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true" {
-; SDAG-LABEL: v_sqrt_f64_no_infs_attribute:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_no_infs_attribute:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -370,34 +202,6 @@ define double @v_sqrt_f64_no_infs_attribute(double %x) "no-infs-fp-math"="true"
 }
 
 define double @v_sqrt_f64_nnan(double %x) {
-; SDAG-LABEL: v_sqrt_f64_nnan:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -706,34 +510,6 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) {
 }
 
 define double @v_sqrt_f64_nsz(double %x) {
-; SDAG-LABEL: v_sqrt_f64_nsz:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -766,34 +542,6 @@ define double @v_sqrt_f64_nsz(double %x) {
 }
 
 define double @v_sqrt_f64_nnan_ninf(double %x) {
-; SDAG-LABEL: v_sqrt_f64_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,34 +574,6 @@ define double @v_sqrt_f64_nnan_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
-; SDAG-LABEL: v_sqrt_f64_nnan_ninf_nsz:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -886,34 +606,6 @@ define double @v_sqrt_f64_nnan_ninf_nsz(double %x) {
 }
 
 define double @v_sqrt_f64_afn(double %x) {
-; SDAG-LABEL: v_sqrt_f64_afn:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_afn:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -946,34 +638,6 @@ define double @v_sqrt_f64_afn(double %x) {
 }
 
 define double @v_sqrt_f64_afn_nsz(double %x) {
-; SDAG-LABEL: v_sqrt_f64_afn_nsz:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_afn_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1106,34 +770,6 @@ define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan(double %x) {
-; SDAG-LABEL: v_sqrt_f64_afn_nnan:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1166,34 +802,6 @@ define double @v_sqrt_f64_afn_nnan(double %x) {
 }
 
 define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
-; SDAG-LABEL: v_sqrt_f64_fabs_afn_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], |v[0:1]|, v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_fabs_afn_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1227,34 +835,6 @@ define double @v_sqrt_f64_fabs_afn_ninf(double %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf(double %x) {
-; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1387,34 +967,6 @@ define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) {
 }
 
 define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
-; SDAG-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1447,34 +999,6 @@ define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) {
 }
 
 define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
-; SDAG-LABEL: v_sqrt_f64__approx_func_fp_math:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64__approx_func_fp_math:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1507,34 +1031,6 @@ define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 {
 }
 
 define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
-; SDAG-LABEL: v_sqrt_f64__enough_unsafe_attrs:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64__enough_unsafe_attrs:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1567,34 +1063,6 @@ define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 {
 }
 
 define double @v_sqrt_f64__unsafe_attr(double %x) #4 {
-; SDAG-LABEL: v_sqrt_f64__unsafe_attr:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_mov_b32 s4, 0
-; SDAG-NEXT:    s_brev_b32 s5, 8
-; SDAG-NEXT:    v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; SDAG-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; SDAG-NEXT:    v_rsq_f64_e32 v[2:3], v[0:1]
-; SDAG-NEXT:    v_mul_f64 v[4:5], v[0:1], v[2:3]
-; SDAG-NEXT:    v_mul_f64 v[2:3], v[2:3], 0.5
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1]
-; SDAG-NEXT:    v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5]
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0xffffff80
-; SDAG-NEXT:    v_mov_b32_e32 v5, 0x260
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
-; SDAG-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v5
-; SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-LABEL: v_sqrt_f64__unsafe_attr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 645e48f1bb1ab0..545a9af3f9a0bd 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -2990,13 +2990,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i8 at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i8 at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3025,10 +3026,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i8 at abs32@lo
 ; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
@@ -3060,9 +3062,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
@@ -3090,10 +3093,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i8 at abs32@lo
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
@@ -4152,13 +4156,14 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX9-NEXT:    global_load_ushort v0, v[40:41], off
 ; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret at abs32@hi
 ; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret at abs32@lo
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
 ; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -4191,14 +4196,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret at abs32@hi
 ; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret at abs32@lo
-; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    global_load_ushort v0, v[40:41], off
 ; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-NEXT:    s_addk_i32 s32, 0x200
 ; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
@@ -4239,14 +4245,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
 ; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    global_store_b16 v[40:41], v0, off
 ; GFX11-NEXT:    s_clause 0x1
@@ -4274,14 +4282,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret at abs32@hi
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret at abs32@lo
-; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[40:41], off
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 81210d8f5d0ca3..42f2163faaeee8 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -638,92 +638,96 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: udiv16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x2c
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x400
-; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s2, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB4_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
-; GFX9-NEXT:    v_mul_f32_e32 v6, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT:    v_mad_f32 v4, -v6, v0, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    global_store_short v5, v4, s[4:5]
-; GFX9-NEXT:    s_cbranch_vccz .LBB4_1
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT:    s_lshl_b32 s3, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v2, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; GFX9-NEXT:    v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX9-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: udiv16_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x2c
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX10-NEXT:    s_mov_b32 s2, 0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB4_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    v_mul_f32_e32 v6, v4, v1
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_mad_f32 v4, -v6, v0, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v6, s0
-; GFX10-NEXT:    global_store_short v5, v4, s[4:5]
-; GFX10-NEXT:    s_cbranch_vccz .LBB4_1
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX10-NEXT:    s_lshl_b32 s3, s2, 1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10-NEXT:    s_and_b32 s3, s2, 0xffff
+; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-NEXT:    s_cmpk_eq_i32 s3, 0x400
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    global_store_short v4, v2, s[0:1]
+; GFX10-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: udiv16_invariant_denom:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x2c
-; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT:    s_mov_b32 s2, 0
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB4_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v3
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 1, v3
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX11-NEXT:    s_lshl_b32 s3, s2, 1
+; GFX11-NEXT:    s_add_i32 s2, s2, 1
+; GFX11-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11-NEXT:    s_and_b32 s3, s2, 0xffff
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s0, |v4|, v0
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v5, s0
-; GFX11-NEXT:    global_store_b16 v3, v4, s[2:3]
-; GFX11-NEXT:    s_cbranch_vccz .LBB4_1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    global_store_b16 v4, v2, s[0:1]
+; GFX11-NEXT:    s_cbranch_scc0 .LBB4_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -749,31 +753,32 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-LABEL: urem16_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x2c
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s0, 0xffff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    s_movk_i32 s2, 0x400
+; GFX9-NEXT:    s_mov_b32 s2, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB5_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GFX9-NEXT:    v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
+; GFX9-NEXT:    s_and_b32 s3, 0xffff, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX9-NEXT:    s_lshl_b32 s5, s3, 1
+; GFX9-NEXT:    s_add_i32 s2, s3, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX9-NEXT:    s_and_b32 s5, s2, 0xffff
+; GFX9-NEXT:    s_cmpk_eq_i32 s5, 0x400
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_short v5, v4, s[0:1]
-; GFX9-NEXT:    s_cbranch_vccz .LBB5_1
+; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX9-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -782,28 +787,30 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    s_mov_b32 s3, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX10-NEXT:  .LBB5_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX10-NEXT:    v_mad_f32 v4, -v5, v0, v4
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT:    v_mul_lo_u32 v4, v4, s2
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX10-NEXT:    global_store_short v5, v4, s[0:1]
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    s_cbranch_vccz .LBB5_1
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX10-NEXT:    s_lshl_b32 s5, s4, 1
+; GFX10-NEXT:    s_add_i32 s3, s4, 1
+; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX10-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX10-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -812,7 +819,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s2, s4, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -821,26 +828,28 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB5_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX11-NEXT:    s_lshl_b32 s5, s4, 1
+; GFX11-NEXT:    s_add_i32 s3, s4, 1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v1
-; GFX11-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f32 v4, -v5, v0, v4
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v4|, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 1, v3
-; GFX11-NEXT:    v_mul_lo_u32 v4, v4, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v3, v4
-; GFX11-NEXT:    global_store_b16 v5, v3, s[0:1]
-; GFX11-NEXT:    s_cbranch_vccz .LBB5_1
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX11-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v3, s5
+; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    s_cbranch_scc0 .LBB5_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -870,33 +879,32 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x400
+; GFX9-NEXT:    s_mov_b32 s2, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB6_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_sext_i32_i16 s5, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT:    s_xor_b32 s6, s5, s4
-; GFX9-NEXT:    s_ashr_i32 s5, s6, 30
-; GFX9-NEXT:    s_or_b32 s5, s5, 1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
+; GFX9-NEXT:    s_sext_i32_i16 s3, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s3
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
+; GFX9-NEXT:    s_or_b32 s3, s3, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    v_add_u16_e64 v2, s3, 1
-; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX9-NEXT:    s_and_b32 s6, 0xffff, s3
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, s5, v4
-; GFX9-NEXT:    s_lshl_b32 s5, s6, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT:    v_add_u32_e32 v2, s3, v3
+; GFX9-NEXT:    s_lshl_b32 s3, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_and_b32 s3, s2, 0xffff
+; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x400
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
-; GFX9-NEXT:    s_cbranch_vccz .LBB6_1
+; GFX9-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -913,26 +921,26 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:  .LBB6_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s3
-; GFX10-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX10-NEXT:    s_xor_b32 s5, s4, s2
-; GFX10-NEXT:    s_ashr_i32 s4, s5, 30
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v1
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX10-NEXT:    s_xor_b32 s4, s4, s2
+; GFX10-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX10-NEXT:    s_or_b32 s4, s4, 1
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s5, |v3|, |v0|
+; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s5, |v2|, |v0|
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX10-NEXT:    s_and_b32 s5, s5, exec_lo
 ; GFX10-NEXT:    s_cselect_b32 s4, s4, 0
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, s4, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, s4, v2
+; GFX10-NEXT:    s_lshl_b32 s4, s3, 1
+; GFX10-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
 ; GFX10-NEXT:    global_store_short v3, v2, s[0:1]
-; GFX10-NEXT:    s_cbranch_vccz .LBB6_1
+; GFX10-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -951,30 +959,32 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:  .LBB6_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT:    s_xor_b32 s5, s4, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_ashr_i32 s4, s5, 30
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
+; GFX11-NEXT:    s_xor_b32 s4, s4, s2
+; GFX11-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s4, s4, 1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX11-NEXT:    v_fma_f32 v3, -v4, v0, v3
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s5, |v3|, |v0|
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s5, |v2|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_and_b32 s5, s5, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s4, s4, 0
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, s4, v2
+; GFX11-NEXT:    s_lshl_b32 s4, s3, 1
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    v_mov_b32_e32 v3, s4
+; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
+; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
 ; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
-; GFX11-NEXT:    s_cbranch_vccz .LBB6_1
+; GFX11-NEXT:    s_cbranch_scc0 .LBB6_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1004,36 +1014,34 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX9-NEXT:    s_sext_i32_i16 s4, s0
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT:    s_mov_b32 s3, 0
-; GFX9-NEXT:    s_movk_i32 s2, 0x400
+; GFX9-NEXT:    s_mov_b32 s2, 0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
 ; GFX9-NEXT:  .LBB7_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_sext_i32_i16 s5, s3
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT:    s_xor_b32 s6, s5, s4
-; GFX9-NEXT:    s_ashr_i32 s6, s6, 30
-; GFX9-NEXT:    s_or_b32 s8, s6, 1
-; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
+; GFX9-NEXT:    s_sext_i32_i16 s3, s2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s3
+; GFX9-NEXT:    s_xor_b32 s5, s3, s4
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX9-NEXT:    s_or_b32 s5, s5, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    v_add_u16_e64 v2, s3, 1
-; GFX9-NEXT:    s_cselect_b32 s6, s8, 0
-; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, s2, v2
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s3
-; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, s6, v4
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
+; GFX9-NEXT:    v_add_u32_e32 v2, s5, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s4
-; GFX9-NEXT:    s_lshl_b32 s6, s7, 1
-; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX9-NEXT:    s_lshl_b32 s5, s2, 1
+; GFX9-NEXT:    s_add_i32 s2, s2, 1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    s_and_b32 s5, s2, 0xffff
+; GFX9-NEXT:    s_cmpk_eq_i32 s5, 0x400
+; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v2, s[0:1]
-; GFX9-NEXT:    s_cbranch_vccz .LBB7_1
+; GFX9-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX9-NEXT:  ; %bb.2: ; %bb2
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1050,29 +1058,28 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX10-NEXT:  .LBB7_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_sext_i32_i16 s4, s3
-; GFX10-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX10-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX10-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX10-NEXT:    s_xor_b32 s5, s4, s2
 ; GFX10-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX10-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX10-NEXT:    v_mul_f32_e32 v4, v3, v1
 ; GFX10-NEXT:    s_or_b32 s5, s5, 1
-; GFX10-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX10-NEXT:    v_mad_f32 v3, -v4, v0, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, |v3|, |v0|
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; GFX10-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX10-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s6, |v2|, |v0|
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX10-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, s5, v3
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX10-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
-; GFX10-NEXT:    global_store_short v2, v3, s[0:1]
-; GFX10-NEXT:    s_cbranch_vccz .LBB7_1
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, s5, v2
+; GFX10-NEXT:    s_lshl_b32 s5, s3, 1
+; GFX10-NEXT:    s_add_i32 s3, s3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
+; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX10-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX10-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT:    global_store_short v3, v2, s[0:1]
+; GFX10-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX10-NEXT:  ; %bb.2: ; %bb2
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1091,35 +1098,35 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
 ; GFX11-NEXT:  .LBB7_1: ; %bb3
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_sext_i32_i16 s4, s3
-; GFX11-NEXT:    v_add_nc_u16 v2, s3, 1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX11-NEXT:    s_xor_b32 s5, s4, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v4, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 s5, s5, 1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX11-NEXT:    v_trunc_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX11-NEXT:    v_fma_f32 v3, -v4, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v3|, |v0|
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; GFX11-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX11-NEXT:    v_cmp_ge_f32_e64 s6, |v2|, |v0|
+; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, s5, v3
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v2
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
-; GFX11-NEXT:    v_mul_lo_u32 v3, v3, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
-; GFX11-NEXT:    global_store_b16 v2, v3, s[0:1]
-; GFX11-NEXT:    s_cbranch_vccz .LBB7_1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, s5, v2
+; GFX11-NEXT:    s_lshl_b32 s5, s3, 1
+; GFX11-NEXT:    s_add_i32 s3, s3, 1
+; GFX11-NEXT:    v_mov_b32_e32 v3, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX11-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_cmpk_eq_i32 s4, 0x400
+; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT:    s_cbranch_scc0 .LBB7_1
 ; GFX11-NEXT:  ; %bb.2: ; %bb2
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index f407a1c26dd3eb..ecece35337a7a8 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -1505,7 +1505,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
 ; GFX10-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00]
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a]
 ; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
@@ -1523,7 +1523,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
 ; GFX11-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00]
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4a]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
 ; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
 ; VI-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
-; VI-NEXT:    v_add_u16_e32 v0, -1, v0 ; encoding: [0xc1,0x00,0x00,0x4c]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, -1, v0 ; encoding: [0xc1,0x00,0x00,0x32]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; VI-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
@@ -1586,7 +1586,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
 ; GFX10-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00]
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00]
 ; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
@@ -1604,7 +1604,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
 ; GFX11-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00]
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xfe,0xff,0x00,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
 ; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
@@ -1624,7 +1624,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
 ; VI-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
-; VI-NEXT:    v_add_u16_e32 v0, -2, v0 ; encoding: [0xc2,0x00,0x00,0x4c]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfffe, v0 ; encoding: [0xff,0x00,0x00,0x32,0xfe,0xff,0x00,0x00]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; VI-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
@@ -1667,7 +1667,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr
 ; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
 ; GFX10-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
-; GFX10-NEXT:    v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00]
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00]
 ; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
@@ -1685,7 +1685,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
 ; GFX11-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00]
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x4a,0xf0,0xff,0x00,0x00]
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
 ; GFX11-NEXT:    s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
@@ -1705,7 +1705,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
 ; VI-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; VI-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
-; VI-NEXT:    v_add_u16_e32 v0, -16, v0 ; encoding: [0xd0,0x00,0x00,0x4c]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 0xfff0, v0 ; encoding: [0xff,0x00,0x00,0x32,0xf0,0xff,0x00,0x00]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
 ; VI-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index cddfb21a6fbdf4..9d1368b2ec105a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -90,77 +90,85 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s14, s23
 ; GFX11-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX11-NEXT:    s_mov_b32 s1, -1
-; GFX11-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-NEXT:    s_branch .LBB2_12
-; GFX11-NEXT:  .LBB2_3:
-; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:  .LBB2_3: ; %Flow10
 ; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
-; GFX11-NEXT:  .LBB2_4: ; %bb16
-; GFX11-NEXT:    s_load_b32 s6, s[2:3], 0x54
+; GFX11-NEXT:  ; %bb.4: ; %bb16
+; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x54
 ; GFX11-NEXT:    s_bitcmp1_b32 s19, 0
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    s_and_b32 s7, s19, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s6, 0
-; GFX11-NEXT:    s_mov_b32 s6, -1
 ; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s7, 0
+; GFX11-NEXT:    s_and_b32 s1, s19, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
 ; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
 ; GFX11-NEXT:    s_load_b128 s[24:27], s[2:3], 0x44
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_hi_u32 s6, s25, s24
-; GFX11-NEXT:    s_mul_i32 s7, s25, s24
+; GFX11-NEXT:    s_mul_hi_u32 s0, s25, s24
+; GFX11-NEXT:    s_mul_i32 s1, s25, s24
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s6, s7, 1
-; GFX11-NEXT:    s_mov_b32 s7, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s22
+; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s6, s6, 1
-; GFX11-NEXT:    s_lshr_b32 s6, s6, s26
+; GFX11-NEXT:    s_or_b32 s0, s0, 1
+; GFX11-NEXT:    s_lshr_b32 s0, s0, s26
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_i32 s6, s6, s18
-; GFX11-NEXT:    s_mul_i32 s6, s6, s16
+; GFX11-NEXT:    s_mul_i32 s0, s0, s18
+; GFX11-NEXT:    s_mul_i32 s0, s0, s16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s6, s21, s6
-; GFX11-NEXT:    s_lshl_b64 s[18:19], s[6:7], 1
-; GFX11-NEXT:    global_load_u16 v1, v2, s[18:19]
+; GFX11-NEXT:    s_or_b32 s0, s21, s0
+; GFX11-NEXT:    s_lshl_b64 s[18:19], s[0:1], 1
+; GFX11-NEXT:    s_mov_b32 s0, s1
+; GFX11-NEXT:    global_load_u16 v1, v0, s[18:19]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s22
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11-NEXT:    .p2align 6
 ; GFX11-NEXT:  .LBB2_6: ; %bb18
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_cmp_ne_u16_e64 s6, s7, 0
-; GFX11-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    s_and_b32 vcc_lo, s8, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
-; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_bitcmp1_b32 s6, 0
-; GFX11-NEXT:    s_cselect_b32 s6, 0x100, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s7, s6, s7
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s1, s7, s1
+; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
+; GFX11-NEXT:    s_cselect_b32 s1, s13, s9
+; GFX11-NEXT:    s_and_b32 s9, 0xffff, s0
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s9, 0
+; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX11-NEXT:    s_and_b32 s16, s8, exec_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s9
+; GFX11-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s13, v2
+; GFX11-NEXT:    s_cselect_b32 s9, s13, s9
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bitcmp1_b32 s9, 0
+; GFX11-NEXT:    s_cselect_b32 s9, 0x100, 0
+; GFX11-NEXT:    s_or_b32 s0, s9, s0
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
 ; GFX11-NEXT:  ; %bb.7: ; %Flow
-; GFX11-NEXT:    s_mov_b32 s6, 0
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:  .LBB2_8: ; %Flow12
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s6
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
 ; GFX11-NEXT:  ; %bb.9:
-; GFX11-NEXT:    s_xor_b32 s0, s8, -1
+; GFX11-NEXT:    s_xor_b32 s0, s7, -1
 ; GFX11-NEXT:  .LBB2_10: ; %bb17
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -169,7 +177,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:  ; %bb.11: ; %Flow6
 ; GFX11-NEXT:    s_mov_b32 s17, -1
 ; GFX11-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-NEXT:    s_and_b32 s6, s1, exec_lo
+; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_or_not1_b32 s0, s17, exec_lo
 ; GFX11-NEXT:  .LBB2_13: ; %Flow9
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s20
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 247ec407df5fd3..f7715364637787 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -473,101 +473,89 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s2, s7, 24
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 15
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s7, 16
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_lshr_b32 s3, s7, 16
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 14
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s7, 8
-; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 0xff
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshr_b32 s9, s7, 8
+; GCN-NEXT:    s_lshl_b32 s2, s2, 16
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 13
-; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s3, s9, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 12
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT:    s_lshr_b32 s2, s6, 24
-; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_and_b32 s7, s7, 0xff
+; GCN-NEXT:    s_or_b32 s3, s7, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0xffff
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshr_b32 s3, s6, 24
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 11
-; GCN-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s6, 16
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 8
+; GCN-NEXT:    s_lshr_b32 s7, s6, 16
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 10
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s6, 8
-; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_and_b32 s7, s7, 0xff
+; GCN-NEXT:    s_or_b32 s3, s7, s3
+; GCN-NEXT:    s_lshl_b32 s3, s3, 16
+; GCN-NEXT:    s_lshr_b32 s7, s6, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 9
-; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_lshl_b32 s7, s7, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 8
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT:    s_lshr_b32 s2, s5, 24
-; GCN-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    s_cselect_b32 s6, s6, 1
+; GCN-NEXT:    s_and_b32 s6, s6, 0xff
+; GCN-NEXT:    s_or_b32 s6, s6, s7
+; GCN-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NEXT:    s_or_b32 s3, s6, s3
+; GCN-NEXT:    s_lshr_b32 s6, s5, 24
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 7
-; GCN-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s5, 16
+; GCN-NEXT:    s_cselect_b32 s6, s6, 1
+; GCN-NEXT:    s_lshl_b32 s6, s6, 8
+; GCN-NEXT:    s_lshr_b32 s7, s5, 16
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 6
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s5, 8
-; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_and_b32 s7, s7, 0xff
+; GCN-NEXT:    s_or_b32 s6, s7, s6
+; GCN-NEXT:    s_lshl_b32 s6, s6, 16
+; GCN-NEXT:    s_lshr_b32 s7, s5, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 5
-; GCN-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_lshl_b32 s7, s7, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 4
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT:    s_lshr_b32 s2, s4, 24
-; GCN-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 0xff
+; GCN-NEXT:    s_or_b32 s5, s5, s7
+; GCN-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NEXT:    s_or_b32 s5, s5, s6
+; GCN-NEXT:    s_lshr_b32 s6, s4, 24
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 3
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s4, 16
+; GCN-NEXT:    s_cselect_b32 s6, s6, 1
+; GCN-NEXT:    s_lshl_b32 s6, s6, 8
+; GCN-NEXT:    s_lshr_b32 s7, s4, 16
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 2
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_lshr_b32 s2, s4, 8
-; GCN-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_and_b32 s7, s7, 0xff
+; GCN-NEXT:    s_or_b32 s6, s7, s6
+; GCN-NEXT:    s_lshl_b32 s6, s6, 16
+; GCN-NEXT:    s_lshr_b32 s7, s4, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 1
-; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v4, s2
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_lshl_b32 s7, s7, 8
 ; GCN-NEXT:    s_cmp_lg_u32 s8, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT:    v_mov_b32_e32 v5, s4
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
-; GCN-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GCN-NEXT:    s_cselect_b32 s4, s4, 1
+; GCN-NEXT:    s_and_b32 s4, s4, 0xff
+; GCN-NEXT:    s_or_b32 s4, s4, s7
+; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
+; GCN-NEXT:    s_or_b32 s4, s4, s6
 ; GCN-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -971,22 +959,22 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32
 ; GCN-NEXT:    s_mov_b32 s15, 0xe80000
 ; GCN-NEXT:    s_add_u32 s12, s12, s9
 ; GCN-NEXT:    s_addc_u32 s13, s13, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bfe_u32 s6, s2, 0x10003
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    s_bfe_u32 s5, s2, 0x20002
+; GCN-NEXT:    buffer_store_byte v1, off, s[12:15], 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GCN-NEXT:    buffer_store_byte v1, off, s[12:15], 0 offset:3
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_and_b32 s3, s3, 3
-; GCN-NEXT:    v_lshrrev_b16_e64 v2, 1, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 2, s2
-; GCN-NEXT:    v_lshrrev_b16_e64 v5, 3, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, 1
+; GCN-NEXT:    buffer_store_byte v1, off, s[12:15], 0 offset:2
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_or_b32_e32 v0, s3, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 3, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
-; GCN-NEXT:    buffer_store_byte v3, off, s[12:15], 0
-; GCN-NEXT:    buffer_store_byte v5, off, s[12:15], 0 offset:3
-; GCN-NEXT:    buffer_store_byte v4, off, s[12:15], 0 offset:2
-; GCN-NEXT:    buffer_store_byte v2, off, s[12:15], 0 offset:1
+; GCN-NEXT:    buffer_store_byte v1, off, s[12:15], 0 offset:1
+; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    buffer_store_byte v1, v0, s[12:15], 0 offen
 ; GCN-NEXT:    buffer_load_ubyte v0, off, s[12:15], 0
 ; GCN-NEXT:    buffer_load_ubyte v1, off, s[12:15], 0 offset:1
@@ -1020,909 +1008,838 @@ define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec,
 ; GCN-LABEL: bit128_inselt:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GCN-NEXT:    s_load_dword s8, s[2:3], 0x44
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT:    s_load_dword s2, s[2:3], 0x44
+; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s3, s4, 24
-; GCN-NEXT:    s_lshr_b32 s8, s4, 16
-; GCN-NEXT:    s_lshr_b32 s9, s4, 17
-; GCN-NEXT:    s_lshr_b32 s10, s4, 18
-; GCN-NEXT:    s_lshr_b32 s11, s4, 19
-; GCN-NEXT:    s_lshr_b32 s12, s4, 20
-; GCN-NEXT:    s_lshr_b32 s13, s4, 21
-; GCN-NEXT:    s_lshr_b32 s14, s4, 22
-; GCN-NEXT:    s_lshr_b32 s15, s4, 23
-; GCN-NEXT:    s_lshr_b32 s16, s5, 24
-; GCN-NEXT:    s_lshr_b32 s17, s5, 16
-; GCN-NEXT:    s_lshr_b32 s18, s5, 17
-; GCN-NEXT:    s_lshr_b32 s19, s5, 18
-; GCN-NEXT:    s_lshr_b32 s20, s5, 19
-; GCN-NEXT:    s_lshr_b32 s21, s5, 20
-; GCN-NEXT:    s_lshr_b32 s22, s5, 21
-; GCN-NEXT:    s_lshr_b32 s23, s5, 22
-; GCN-NEXT:    s_lshr_b32 s24, s5, 23
-; GCN-NEXT:    s_lshr_b32 s25, s6, 24
-; GCN-NEXT:    s_lshr_b32 s26, s6, 16
-; GCN-NEXT:    s_lshr_b32 s27, s6, 17
-; GCN-NEXT:    s_lshr_b32 s28, s6, 18
-; GCN-NEXT:    s_lshr_b32 s29, s6, 19
-; GCN-NEXT:    s_lshr_b32 s30, s6, 20
-; GCN-NEXT:    s_lshr_b32 s31, s6, 21
-; GCN-NEXT:    s_lshr_b32 s33, s6, 22
-; GCN-NEXT:    s_lshr_b32 s34, s6, 23
-; GCN-NEXT:    s_lshr_b32 s35, s7, 24
-; GCN-NEXT:    s_lshr_b32 s36, s7, 16
-; GCN-NEXT:    s_lshr_b32 s37, s7, 17
-; GCN-NEXT:    s_lshr_b32 s38, s7, 18
-; GCN-NEXT:    s_lshr_b32 s39, s7, 19
-; GCN-NEXT:    s_lshr_b32 s40, s7, 20
-; GCN-NEXT:    s_lshr_b32 s41, s7, 21
-; GCN-NEXT:    s_lshr_b32 s42, s7, 22
-; GCN-NEXT:    s_lshr_b32 s43, s7, 23
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x77
-; GCN-NEXT:    v_mov_b32_e32 v15, s43
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x76
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s42
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x75
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s41
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x74
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s40
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x73
-; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
-; GCN-NEXT:    v_mov_b32_e32 v18, s39
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x72
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s38
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x71
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_mov_b32_e32 v19, s37
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x70
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_mov_b32_e32 v20, s36
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7f
-; GCN-NEXT:    v_or_b32_e32 v15, v18, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s35
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s35
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s35
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x7a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x78
-; GCN-NEXT:    v_mov_b32_e32 v13, s35
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x79
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s35
-; GCN-NEXT:    v_cndmask_b32_e32 v13, 1, v13, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v13, 1, v13
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v13, v13, v20
-; GCN-NEXT:    v_and_b32_e32 v13, 3, v13
-; GCN-NEXT:    v_or_b32_e32 v19, v13, v19
-; GCN-NEXT:    v_mov_b32_e32 v13, 15
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v19, v19, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6f
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 15, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 14, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 13, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 12, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x6a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 10, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x69
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 9, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x68
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 8, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v20
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x67
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x66
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x65
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x64
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x63
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x62
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x61
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s7
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x60
-; GCN-NEXT:    v_mov_b32_e32 v16, s7
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v20
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 4, v18
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x57
-; GCN-NEXT:    v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v17, s34
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x56
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s33
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x55
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_mov_b32_e32 v18, s31
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x54
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s30
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x53
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_mov_b32_e32 v18, s29
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x52
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s28
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x51
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_mov_b32_e32 v19, s27
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x50
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_mov_b32_e32 v20, s26
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 15, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5f
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 7, s25
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5e
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5d
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s25
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5c
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5b
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 3, s25
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x5a
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x58
-; GCN-NEXT:    v_mov_b32_e32 v3, s25
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x59
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s25
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v3, 1, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
-; GCN-NEXT:    v_and_b32_e32 v3, 3, v3
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 12, v18
-; GCN-NEXT:    v_and_b32_sdwa v3, v3, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4f
-; GCN-NEXT:    v_or_b32_sdwa v17, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 15, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4e
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 14, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4d
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 13, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4c
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 12, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4b
-; GCN-NEXT:    v_or_b32_e32 v3, v18, v3
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 11, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x4a
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 10, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 3, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x49
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 9, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x48
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 8, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v19, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 12, v3
-; GCN-NEXT:    v_and_b32_sdwa v18, v18, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x47
-; GCN-NEXT:    v_or_b32_e32 v18, v3, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 7, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x46
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 6, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x45
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 5, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x44
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 4, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_or_b32_e32 v19, v20, v19
-; GCN-NEXT:    v_and_b32_e32 v19, 3, v19
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x43
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v3
-; GCN-NEXT:    v_lshrrev_b16_e64 v3, 3, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x42
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 2, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v3, 1, v3, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    v_and_b32_e32 v20, 1, v20
-; GCN-NEXT:    v_lshlrev_b16_e32 v3, 3, v3
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 2, v20
-; GCN-NEXT:    s_cmpk_lg_i32 s2, 0x41
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v20
-; GCN-NEXT:    v_lshrrev_b16_e64 v20, 1, s6
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 64
-; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_cndmask_b32_e32 v20, 1, v20, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v20, 1, v20
-; GCN-NEXT:    v_and_b32_e32 v2, 1, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
-; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
-; GCN-NEXT:    v_or_b32_sdwa v3, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v19
-; GCN-NEXT:    v_and_b32_e32 v2, 15, v2
-; GCN-NEXT:    s_cmp_lg_u32 s2, 55
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v15
-; GCN-NEXT:    v_mov_b32_e32 v15, s24
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 54
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s23
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 53
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s22
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 52
-; GCN-NEXT:    v_or_b32_sdwa v2, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s21
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 51
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_mov_b32_e32 v16, s20
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 50
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s19
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 49
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s18
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 48
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v18, s17
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 63
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s16
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 62
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 61
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s16
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 60
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 59
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s16
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 58
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    s_cmp_lg_u32 s2, 56
-; GCN-NEXT:    v_mov_b32_e32 v14, s16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 57
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 1, s16
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v18
-; GCN-NEXT:    v_and_b32_e32 v14, 3, v14
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v14, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
-; GCN-NEXT:    s_cmp_lg_u32 s2, 47
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 15, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 46
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 14, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 45
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 13, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 44
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 12, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 43
-; GCN-NEXT:    v_or_b32_e32 v14, v16, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 11, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 42
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 10, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 41
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 9, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 40
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 8, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 12, v14
-; GCN-NEXT:    v_and_b32_sdwa v16, v16, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    s_cmp_lg_u32 s2, 39
-; GCN-NEXT:    v_or_b32_e32 v16, v14, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 7, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 38
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 37
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 36
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 4, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 35
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v14
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 3, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 34
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 2, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 3, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v18, 2, v18
-; GCN-NEXT:    s_cmp_lg_u32 s2, 33
-; GCN-NEXT:    v_or_b32_e32 v18, v14, v18
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 1, s5
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 32
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 1, v14
-; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v14
-; GCN-NEXT:    v_and_b32_e32 v1, 3, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 4, v17
-; GCN-NEXT:    v_and_b32_e32 v1, 15, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v17
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    s_cmp_lg_u32 s2, 23
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v15, s15
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 22
-; GCN-NEXT:    v_cndmask_b32_e32 v15, 1, v15, vcc
-; GCN-NEXT:    v_mov_b32_e32 v16, s14
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_and_b32_e32 v16, 1, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 3, v15
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 2, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 21
-; GCN-NEXT:    v_or_b32_e32 v15, v15, v16
-; GCN-NEXT:    v_mov_b32_e32 v16, s13
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 20
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s12
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 1, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 3, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 19
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_mov_b32_e32 v16, s11
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 18
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    v_mov_b32_e32 v17, s10
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 17
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_mov_b32_e32 v19, s8
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v15, 4, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 15, v16
-; GCN-NEXT:    s_cmp_lg_u32 s2, 31
-; GCN-NEXT:    v_or_b32_e32 v15, v16, v15
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 7, s3
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 30
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 6, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 29
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 5, s3
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 28
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 4, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v17, v19, v17
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 27
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 3, s3
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 26
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 2, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v19, 1, v19
-; GCN-NEXT:    s_cmp_lg_u32 s2, 24
-; GCN-NEXT:    v_mov_b32_e32 v18, s3
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 2, v19
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 25
-; GCN-NEXT:    v_or_b32_e32 v17, v17, v19
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 1, s3
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v19, 1, v19, vcc
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_lshlrev_b16_e32 v19, 1, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v19
-; GCN-NEXT:    v_and_b32_e32 v18, 3, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v17, v17, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 15
-; GCN-NEXT:    v_or_b32_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_lshrrev_b16_e64 v16, 15, s4
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 14
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 14, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v16, 1, v16, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    v_and_b32_e32 v17, 1, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 3, v16
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 2, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 13
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_lshrrev_b16_e64 v17, 13, s4
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 12
-; GCN-NEXT:    v_lshrrev_b16_e64 v18, 12, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v17, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v18, 1, v18, vcc
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 1, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 1, v18
-; GCN-NEXT:    v_or_b32_e32 v17, v18, v17
-; GCN-NEXT:    s_cmp_lg_u32 s2, 11
-; GCN-NEXT:    v_lshrrev_b16_e64 v19, 11, s4
-; GCN-NEXT:    v_and_b32_e32 v17, 3, v17
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 10
-; GCN-NEXT:    v_lshrrev_b16_e64 v14, 10, s4
-; GCN-NEXT:    v_or_b32_e32 v16, v17, v16
-; GCN-NEXT:    v_cndmask_b32_e32 v17, 1, v19, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 9
-; GCN-NEXT:    v_lshrrev_b16_e64 v12, 9, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v14, 1, v14, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 8
-; GCN-NEXT:    v_lshrrev_b16_e64 v11, 8, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v12, 1, v12, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 7
-; GCN-NEXT:    v_lshrrev_b16_e64 v10, 7, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v11, 1, v11, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 6
-; GCN-NEXT:    v_lshrrev_b16_e64 v9, 6, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v10, 1, v10, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 5
-; GCN-NEXT:    v_lshrrev_b16_e64 v8, 5, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v9, 1, v9, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 4
-; GCN-NEXT:    v_lshrrev_b16_e64 v7, 4, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v8, 1, v8, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 3
-; GCN-NEXT:    v_lshrrev_b16_e64 v6, 3, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v7, 1, v7, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 2
-; GCN-NEXT:    v_lshrrev_b16_e64 v5, 2, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v6, 1, v6, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 1
-; GCN-NEXT:    v_lshrrev_b16_e64 v4, 1, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v5, 1, v5, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_cmp_lg_u32 s2, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT:    v_and_b32_e32 v14, 1, v14
-; GCN-NEXT:    v_lshlrev_b16_e32 v12, 1, v12
-; GCN-NEXT:    v_and_b32_e32 v11, 1, v11
-; GCN-NEXT:    v_and_b32_e32 v9, 1, v9
-; GCN-NEXT:    v_lshlrev_b16_e32 v8, 1, v8
-; GCN-NEXT:    v_and_b32_e32 v7, 1, v7
-; GCN-NEXT:    v_and_b32_e32 v5, 1, v5
-; GCN-NEXT:    v_lshlrev_b16_e32 v4, 1, v4
-; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_lshlrev_b16_e32 v17, 3, v17
-; GCN-NEXT:    v_lshlrev_b16_e32 v14, 2, v14
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
-; GCN-NEXT:    v_lshlrev_b16_e32 v10, 3, v10
-; GCN-NEXT:    v_lshlrev_b16_e32 v9, 2, v9
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v8
-; GCN-NEXT:    v_lshlrev_b16_e32 v6, 3, v6
-; GCN-NEXT:    v_lshlrev_b16_e32 v5, 2, v5
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_or_b32_e32 v14, v17, v14
-; GCN-NEXT:    v_and_b32_e32 v11, 3, v11
-; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
-; GCN-NEXT:    v_and_b32_e32 v7, 3, v7
-; GCN-NEXT:    v_or_b32_e32 v5, v6, v5
-; GCN-NEXT:    v_and_b32_e32 v0, 3, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v14
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v9
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v5
-; GCN-NEXT:    v_lshlrev_b16_e32 v16, 12, v16
-; GCN-NEXT:    v_and_b32_sdwa v11, v11, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GCN-NEXT:    v_lshlrev_b16_e32 v7, 4, v7
-; GCN-NEXT:    v_and_b32_e32 v0, 15, v0
-; GCN-NEXT:    v_or_b32_e32 v11, v16, v11
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v7
-; GCN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-NEXT:    v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xf0001
+; GCN-NEXT:    s_lshr_b32 s42, s5, 16
+; GCN-NEXT:    v_writelane_b32 v0, s0, 0
+; GCN-NEXT:    v_writelane_b32 v0, s1, 1
+; GCN-NEXT:    s_lshr_b32 s0, s4, 16
+; GCN-NEXT:    v_writelane_b32 v0, s0, 2
+; GCN-NEXT:    s_lshr_b32 s0, s4, 17
+; GCN-NEXT:    v_writelane_b32 v0, s0, 3
+; GCN-NEXT:    s_lshr_b32 s0, s4, 18
+; GCN-NEXT:    v_writelane_b32 v0, s0, 4
+; GCN-NEXT:    s_lshr_b32 s0, s4, 19
+; GCN-NEXT:    v_writelane_b32 v0, s0, 5
+; GCN-NEXT:    s_lshr_b32 s0, s4, 20
+; GCN-NEXT:    v_writelane_b32 v0, s0, 6
+; GCN-NEXT:    s_lshr_b32 s0, s4, 21
+; GCN-NEXT:    v_writelane_b32 v0, s0, 7
+; GCN-NEXT:    s_lshr_b32 s0, s4, 22
+; GCN-NEXT:    v_writelane_b32 v0, s0, 8
+; GCN-NEXT:    s_lshr_b32 s0, s4, 23
+; GCN-NEXT:    v_writelane_b32 v0, s0, 9
+; GCN-NEXT:    s_lshr_b32 s0, s4, 24
+; GCN-NEXT:    v_writelane_b32 v0, s0, 10
+; GCN-NEXT:    s_lshr_b32 s0, s4, 25
+; GCN-NEXT:    v_writelane_b32 v0, s0, 11
+; GCN-NEXT:    s_lshr_b32 s0, s4, 26
+; GCN-NEXT:    v_writelane_b32 v0, s0, 12
+; GCN-NEXT:    s_lshr_b32 s0, s4, 27
+; GCN-NEXT:    v_writelane_b32 v0, s0, 13
+; GCN-NEXT:    s_lshr_b32 s0, s4, 28
+; GCN-NEXT:    v_writelane_b32 v0, s0, 14
+; GCN-NEXT:    s_lshr_b32 s0, s4, 29
+; GCN-NEXT:    v_writelane_b32 v0, s0, 15
+; GCN-NEXT:    s_lshr_b32 s0, s4, 30
+; GCN-NEXT:    v_writelane_b32 v0, s0, 16
+; GCN-NEXT:    s_lshr_b32 s0, s4, 31
+; GCN-NEXT:    v_writelane_b32 v0, s0, 17
+; GCN-NEXT:    v_writelane_b32 v0, s9, 18
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xe0002
+; GCN-NEXT:    v_writelane_b32 v0, s9, 19
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xd0003
+; GCN-NEXT:    v_writelane_b32 v0, s9, 20
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xc0004
+; GCN-NEXT:    v_writelane_b32 v0, s9, 21
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xb0005
+; GCN-NEXT:    v_writelane_b32 v0, s9, 22
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0xa0006
+; GCN-NEXT:    v_writelane_b32 v0, s9, 23
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x90007
+; GCN-NEXT:    v_writelane_b32 v0, s9, 24
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GCN-NEXT:    v_writelane_b32 v0, s9, 25
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x70009
+; GCN-NEXT:    v_writelane_b32 v0, s9, 26
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x6000a
+; GCN-NEXT:    v_writelane_b32 v0, s9, 27
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x5000b
+; GCN-NEXT:    v_writelane_b32 v0, s9, 28
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x4000c
+; GCN-NEXT:    v_writelane_b32 v0, s9, 29
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x3000d
+; GCN-NEXT:    v_writelane_b32 v0, s9, 30
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x2000e
+; GCN-NEXT:    v_writelane_b32 v0, s9, 31
+; GCN-NEXT:    s_bfe_u32 s9, s4, 0x1000f
+; GCN-NEXT:    v_writelane_b32 v0, s9, 32
+; GCN-NEXT:    s_bfe_u32 s9, s5, 0xf0001
+; GCN-NEXT:    s_lshr_b32 s43, s5, 17
+; GCN-NEXT:    s_lshr_b32 s45, s5, 18
+; GCN-NEXT:    s_lshr_b32 s47, s5, 19
+; GCN-NEXT:    s_lshr_b32 s50, s5, 20
+; GCN-NEXT:    s_lshr_b32 s51, s5, 21
+; GCN-NEXT:    s_lshr_b32 s53, s5, 22
+; GCN-NEXT:    s_lshr_b32 s55, s5, 23
+; GCN-NEXT:    s_lshr_b32 s58, s5, 24
+; GCN-NEXT:    s_lshr_b32 s59, s5, 25
+; GCN-NEXT:    s_lshr_b32 s61, s5, 26
+; GCN-NEXT:    s_lshr_b32 s63, s5, 27
+; GCN-NEXT:    s_lshr_b32 s66, s5, 28
+; GCN-NEXT:    s_lshr_b32 s67, s5, 29
+; GCN-NEXT:    s_lshr_b32 s68, s5, 30
+; GCN-NEXT:    s_lshr_b32 s69, s5, 31
+; GCN-NEXT:    s_lshr_b32 s73, s6, 16
+; GCN-NEXT:    s_lshr_b32 s74, s6, 17
+; GCN-NEXT:    s_lshr_b32 s77, s6, 18
+; GCN-NEXT:    s_lshr_b32 s78, s6, 19
+; GCN-NEXT:    s_lshr_b32 s81, s6, 20
+; GCN-NEXT:    s_lshr_b32 s82, s6, 21
+; GCN-NEXT:    s_lshr_b32 s84, s6, 22
+; GCN-NEXT:    s_lshr_b32 s86, s6, 23
+; GCN-NEXT:    s_lshr_b32 s89, s6, 24
+; GCN-NEXT:    s_lshr_b32 s90, s6, 25
+; GCN-NEXT:    s_lshr_b32 s93, s6, 26
+; GCN-NEXT:    s_lshr_b32 s94, s6, 27
+; GCN-NEXT:    s_lshr_b32 vcc_hi, s6, 28
+; GCN-NEXT:    s_lshr_b32 s39, s6, 29
+; GCN-NEXT:    s_lshr_b32 s38, s6, 30
+; GCN-NEXT:    s_lshr_b32 s37, s6, 31
+; GCN-NEXT:    s_lshr_b32 s33, s7, 16
+; GCN-NEXT:    s_lshr_b32 s31, s7, 17
+; GCN-NEXT:    s_lshr_b32 s28, s7, 18
+; GCN-NEXT:    s_lshr_b32 s27, s7, 19
+; GCN-NEXT:    s_lshr_b32 s24, s7, 20
+; GCN-NEXT:    s_lshr_b32 s23, s7, 21
+; GCN-NEXT:    s_lshr_b32 s20, s7, 22
+; GCN-NEXT:    s_lshr_b32 s19, s7, 23
+; GCN-NEXT:    s_lshr_b32 s16, s7, 24
+; GCN-NEXT:    s_lshr_b32 s15, s7, 25
+; GCN-NEXT:    s_lshr_b32 s12, s7, 26
+; GCN-NEXT:    s_lshr_b32 s11, s7, 27
+; GCN-NEXT:    s_lshr_b32 s3, s7, 28
+; GCN-NEXT:    s_lshr_b32 s2, s7, 29
+; GCN-NEXT:    s_lshr_b32 s1, s7, 30
+; GCN-NEXT:    s_lshr_b32 s0, s7, 31
+; GCN-NEXT:    v_writelane_b32 v0, s9, 33
+; GCN-NEXT:    s_bfe_u32 s40, s5, 0xe0002
+; GCN-NEXT:    s_bfe_u32 s41, s5, 0xd0003
+; GCN-NEXT:    s_bfe_u32 s44, s5, 0xc0004
+; GCN-NEXT:    s_bfe_u32 s46, s5, 0xb0005
+; GCN-NEXT:    s_bfe_u32 s48, s5, 0xa0006
+; GCN-NEXT:    s_bfe_u32 s49, s5, 0x90007
+; GCN-NEXT:    s_bfe_u32 s52, s5, 0x80008
+; GCN-NEXT:    s_bfe_u32 s54, s5, 0x70009
+; GCN-NEXT:    s_bfe_u32 s56, s5, 0x6000a
+; GCN-NEXT:    s_bfe_u32 s57, s5, 0x5000b
+; GCN-NEXT:    s_bfe_u32 s60, s5, 0x4000c
+; GCN-NEXT:    s_bfe_u32 s62, s5, 0x3000d
+; GCN-NEXT:    s_bfe_u32 s64, s5, 0x2000e
+; GCN-NEXT:    s_bfe_u32 s65, s5, 0x1000f
+; GCN-NEXT:    s_bfe_u32 s70, s6, 0xf0001
+; GCN-NEXT:    s_bfe_u32 s71, s6, 0xe0002
+; GCN-NEXT:    s_bfe_u32 s72, s6, 0xd0003
+; GCN-NEXT:    s_bfe_u32 s75, s6, 0xc0004
+; GCN-NEXT:    s_bfe_u32 s76, s6, 0xb0005
+; GCN-NEXT:    s_bfe_u32 s79, s6, 0xa0006
+; GCN-NEXT:    s_bfe_u32 s80, s6, 0x90007
+; GCN-NEXT:    s_bfe_u32 s83, s6, 0x80008
+; GCN-NEXT:    s_bfe_u32 s85, s6, 0x70009
+; GCN-NEXT:    s_bfe_u32 s87, s6, 0x6000a
+; GCN-NEXT:    s_bfe_u32 s88, s6, 0x5000b
+; GCN-NEXT:    s_bfe_u32 s91, s6, 0x4000c
+; GCN-NEXT:    s_bfe_u32 s92, s6, 0x3000d
+; GCN-NEXT:    s_bfe_u32 s95, s6, 0x2000e
+; GCN-NEXT:    s_bfe_u32 vcc_lo, s6, 0x1000f
+; GCN-NEXT:    s_bfe_u32 s36, s7, 0xf0001
+; GCN-NEXT:    s_bfe_u32 s35, s7, 0xe0002
+; GCN-NEXT:    s_bfe_u32 s34, s7, 0xd0003
+; GCN-NEXT:    s_bfe_u32 s30, s7, 0xc0004
+; GCN-NEXT:    s_bfe_u32 s29, s7, 0xb0005
+; GCN-NEXT:    s_bfe_u32 s26, s7, 0xa0006
+; GCN-NEXT:    s_bfe_u32 s25, s7, 0x90007
+; GCN-NEXT:    s_bfe_u32 s22, s7, 0x80008
+; GCN-NEXT:    s_bfe_u32 s21, s7, 0x70009
+; GCN-NEXT:    s_bfe_u32 s18, s7, 0x6000a
+; GCN-NEXT:    s_bfe_u32 s17, s7, 0x5000b
+; GCN-NEXT:    s_bfe_u32 s14, s7, 0x4000c
+; GCN-NEXT:    s_bfe_u32 s13, s7, 0x3000d
+; GCN-NEXT:    s_bfe_u32 s10, s7, 0x2000e
+; GCN-NEXT:    s_bfe_u32 s9, s7, 0x1000f
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7f
+; GCN-NEXT:    s_cselect_b32 s0, s0, 1
+; GCN-NEXT:    s_lshl_b32 s0, s0, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7e
+; GCN-NEXT:    s_cselect_b32 s1, s1, 1
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7d
+; GCN-NEXT:    s_cselect_b32 s1, s2, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7c
+; GCN-NEXT:    s_cselect_b32 s2, s3, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 3
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 12
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7b
+; GCN-NEXT:    s_cselect_b32 s1, s11, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x7a
+; GCN-NEXT:    s_cselect_b32 s2, s12, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x79
+; GCN-NEXT:    s_cselect_b32 s2, s15, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x78
+; GCN-NEXT:    s_cselect_b32 s3, s16, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 15
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x77
+; GCN-NEXT:    s_cselect_b32 s1, s19, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x76
+; GCN-NEXT:    s_cselect_b32 s2, s20, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x75
+; GCN-NEXT:    s_cselect_b32 s2, s23, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x74
+; GCN-NEXT:    s_cselect_b32 s3, s24, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 4
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x73
+; GCN-NEXT:    s_cselect_b32 s2, s27, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x72
+; GCN-NEXT:    s_cselect_b32 s3, s28, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x71
+; GCN-NEXT:    s_cselect_b32 s3, s31, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x70
+; GCN-NEXT:    s_cselect_b32 s11, s33, 1
+; GCN-NEXT:    s_and_b32 s11, s11, 1
+; GCN-NEXT:    s_or_b32 s3, s11, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xff
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6f
+; GCN-NEXT:    s_cselect_b32 s1, s9, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6e
+; GCN-NEXT:    s_cselect_b32 s2, s10, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6d
+; GCN-NEXT:    s_cselect_b32 s2, s13, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6c
+; GCN-NEXT:    s_cselect_b32 s3, s14, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 12
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6b
+; GCN-NEXT:    s_cselect_b32 s2, s17, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x6a
+; GCN-NEXT:    s_cselect_b32 s3, s18, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x69
+; GCN-NEXT:    s_cselect_b32 s3, s21, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x68
+; GCN-NEXT:    s_cselect_b32 s9, s22, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x67
+; GCN-NEXT:    s_cselect_b32 s2, s25, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x66
+; GCN-NEXT:    s_cselect_b32 s3, s26, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x65
+; GCN-NEXT:    s_cselect_b32 s3, s29, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x64
+; GCN-NEXT:    s_cselect_b32 s9, s30, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x63
+; GCN-NEXT:    s_cselect_b32 s3, s34, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x62
+; GCN-NEXT:    s_cselect_b32 s9, s35, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s9
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x60
+; GCN-NEXT:    s_cselect_b32 s7, s7, 1
+; GCN-NEXT:    s_and_b32 s7, s7, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x61
+; GCN-NEXT:    s_cselect_b32 s9, s36, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s7, s7, s9
+; GCN-NEXT:    s_and_b32 s7, s7, 3
+; GCN-NEXT:    s_or_b32 s3, s7, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 15
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 0xff
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NEXT:    s_or_b32 s7, s1, s0
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5f
+; GCN-NEXT:    s_cselect_b32 s0, s37, 1
+; GCN-NEXT:    s_lshl_b32 s0, s0, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5e
+; GCN-NEXT:    s_cselect_b32 s1, s38, 1
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5d
+; GCN-NEXT:    s_cselect_b32 s1, s39, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5c
+; GCN-NEXT:    s_cselect_b32 s2, vcc_hi, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 3
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 12
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5b
+; GCN-NEXT:    s_cselect_b32 s1, s94, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x5a
+; GCN-NEXT:    s_cselect_b32 s2, s93, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x59
+; GCN-NEXT:    s_cselect_b32 s2, s90, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x58
+; GCN-NEXT:    s_cselect_b32 s3, s89, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 15
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x57
+; GCN-NEXT:    s_cselect_b32 s1, s86, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x56
+; GCN-NEXT:    s_cselect_b32 s2, s84, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x55
+; GCN-NEXT:    s_cselect_b32 s2, s82, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x54
+; GCN-NEXT:    s_cselect_b32 s3, s81, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 4
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x53
+; GCN-NEXT:    s_cselect_b32 s2, s78, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x52
+; GCN-NEXT:    s_cselect_b32 s3, s77, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x51
+; GCN-NEXT:    s_cselect_b32 s3, s74, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x50
+; GCN-NEXT:    s_cselect_b32 s9, s73, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xff
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4f
+; GCN-NEXT:    s_cselect_b32 s1, vcc_lo, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4e
+; GCN-NEXT:    s_cselect_b32 s2, s95, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4d
+; GCN-NEXT:    s_cselect_b32 s2, s92, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4c
+; GCN-NEXT:    s_cselect_b32 s3, s91, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 12
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4b
+; GCN-NEXT:    s_cselect_b32 s2, s88, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x4a
+; GCN-NEXT:    s_cselect_b32 s3, s87, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x49
+; GCN-NEXT:    s_cselect_b32 s3, s85, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x48
+; GCN-NEXT:    s_cselect_b32 s9, s83, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x47
+; GCN-NEXT:    s_cselect_b32 s2, s80, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x46
+; GCN-NEXT:    s_cselect_b32 s3, s79, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x45
+; GCN-NEXT:    s_cselect_b32 s3, s76, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x44
+; GCN-NEXT:    s_cselect_b32 s9, s75, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x43
+; GCN-NEXT:    s_cselect_b32 s3, s72, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x42
+; GCN-NEXT:    s_cselect_b32 s9, s71, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s9
+; GCN-NEXT:    s_cmp_lg_u32 s8, 64
+; GCN-NEXT:    s_cselect_b32 s6, s6, 1
+; GCN-NEXT:    s_and_b32 s6, s6, 1
+; GCN-NEXT:    s_cmpk_lg_i32 s8, 0x41
+; GCN-NEXT:    s_cselect_b32 s9, s70, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s6, s6, s9
+; GCN-NEXT:    s_and_b32 s6, s6, 3
+; GCN-NEXT:    s_or_b32 s3, s6, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 15
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 0xff
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NEXT:    s_or_b32 s6, s1, s0
+; GCN-NEXT:    s_cmp_lg_u32 s8, 63
+; GCN-NEXT:    s_cselect_b32 s0, s69, 1
+; GCN-NEXT:    s_lshl_b32 s0, s0, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 62
+; GCN-NEXT:    s_cselect_b32 s1, s68, 1
+; GCN-NEXT:    s_and_b32 s1, s1, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 2
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 61
+; GCN-NEXT:    s_cselect_b32 s1, s67, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 60
+; GCN-NEXT:    s_cselect_b32 s2, s66, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 3
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 12
+; GCN-NEXT:    s_cmp_lg_u32 s8, 59
+; GCN-NEXT:    s_cselect_b32 s1, s63, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 58
+; GCN-NEXT:    s_cselect_b32 s2, s61, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 57
+; GCN-NEXT:    s_cselect_b32 s2, s59, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 56
+; GCN-NEXT:    s_cselect_b32 s3, s58, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 15
+; GCN-NEXT:    s_lshl_b32 s1, s1, 8
+; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 55
+; GCN-NEXT:    s_cselect_b32 s1, s55, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 54
+; GCN-NEXT:    s_cselect_b32 s2, s53, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 53
+; GCN-NEXT:    s_cselect_b32 s2, s51, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 52
+; GCN-NEXT:    s_cselect_b32 s3, s50, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 4
+; GCN-NEXT:    s_cmp_lg_u32 s8, 51
+; GCN-NEXT:    s_cselect_b32 s2, s47, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 50
+; GCN-NEXT:    s_cselect_b32 s3, s45, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 49
+; GCN-NEXT:    s_cselect_b32 s3, s43, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 48
+; GCN-NEXT:    s_cselect_b32 s9, s42, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xff
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_lshl_b32 s0, s0, 16
+; GCN-NEXT:    s_cmp_lg_u32 s8, 47
+; GCN-NEXT:    s_cselect_b32 s1, s65, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 46
+; GCN-NEXT:    s_cselect_b32 s2, s64, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 45
+; GCN-NEXT:    s_cselect_b32 s2, s62, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 44
+; GCN-NEXT:    s_cselect_b32 s3, s60, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 12
+; GCN-NEXT:    s_cmp_lg_u32 s8, 43
+; GCN-NEXT:    s_cselect_b32 s2, s57, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 42
+; GCN-NEXT:    s_cselect_b32 s3, s56, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 41
+; GCN-NEXT:    s_cselect_b32 s3, s54, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 40
+; GCN-NEXT:    s_cselect_b32 s9, s52, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 39
+; GCN-NEXT:    s_cselect_b32 s2, s49, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 38
+; GCN-NEXT:    s_cselect_b32 s3, s48, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 37
+; GCN-NEXT:    s_cselect_b32 s3, s46, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 36
+; GCN-NEXT:    s_cselect_b32 s9, s44, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s3, s9, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_cmp_lg_u32 s8, 35
+; GCN-NEXT:    s_cselect_b32 s3, s41, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 34
+; GCN-NEXT:    s_cselect_b32 s9, s40, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s9
+; GCN-NEXT:    s_cmp_lg_u32 s8, 32
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 33
+; GCN-NEXT:    v_readlane_b32 s9, v0, 33
+; GCN-NEXT:    s_cselect_b32 s9, s9, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s5, s5, s9
+; GCN-NEXT:    s_and_b32 s5, s5, 3
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 15
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 0xff
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_and_b32 s1, s1, 0xffff
+; GCN-NEXT:    s_or_b32 s0, s1, s0
+; GCN-NEXT:    s_cmp_lg_u32 s8, 31
+; GCN-NEXT:    v_readlane_b32 s1, v0, 17
+; GCN-NEXT:    s_cselect_b32 s1, s1, 1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 30
+; GCN-NEXT:    v_readlane_b32 s2, v0, 16
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_and_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 2
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 29
+; GCN-NEXT:    v_readlane_b32 s2, v0, 15
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 28
+; GCN-NEXT:    v_readlane_b32 s3, v0, 14
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 3
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 12
+; GCN-NEXT:    s_cmp_lg_u32 s8, 27
+; GCN-NEXT:    v_readlane_b32 s2, v0, 13
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 26
+; GCN-NEXT:    v_readlane_b32 s3, v0, 12
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 25
+; GCN-NEXT:    v_readlane_b32 s3, v0, 11
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 24
+; GCN-NEXT:    v_readlane_b32 s5, v0, 10
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 15
+; GCN-NEXT:    s_lshl_b32 s2, s2, 8
+; GCN-NEXT:    s_or_b32 s1, s1, s2
+; GCN-NEXT:    s_cmp_lg_u32 s8, 23
+; GCN-NEXT:    v_readlane_b32 s2, v0, 9
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 22
+; GCN-NEXT:    v_readlane_b32 s3, v0, 8
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 21
+; GCN-NEXT:    v_readlane_b32 s3, v0, 7
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 20
+; GCN-NEXT:    v_readlane_b32 s5, v0, 6
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_cmp_lg_u32 s8, 19
+; GCN-NEXT:    v_readlane_b32 s3, v0, 5
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 18
+; GCN-NEXT:    v_readlane_b32 s5, v0, 4
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_cmp_lg_u32 s8, 17
+; GCN-NEXT:    v_readlane_b32 s5, v0, 3
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 16
+; GCN-NEXT:    v_readlane_b32 s9, v0, 2
+; GCN-NEXT:    s_cselect_b32 s9, s9, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s5, s9, s5
+; GCN-NEXT:    s_and_b32 s5, s5, 3
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 15
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 0xff
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    s_lshl_b32 s1, s1, 16
+; GCN-NEXT:    s_cmp_lg_u32 s8, 15
+; GCN-NEXT:    v_readlane_b32 s2, v0, 32
+; GCN-NEXT:    s_cselect_b32 s2, s2, 1
+; GCN-NEXT:    s_lshl_b32 s2, s2, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 14
+; GCN-NEXT:    v_readlane_b32 s3, v0, 31
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_and_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 2
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 13
+; GCN-NEXT:    v_readlane_b32 s3, v0, 30
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 12
+; GCN-NEXT:    v_readlane_b32 s5, v0, 29
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 3
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_lshl_b32 s2, s2, 12
+; GCN-NEXT:    s_cmp_lg_u32 s8, 11
+; GCN-NEXT:    v_readlane_b32 s3, v0, 28
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 10
+; GCN-NEXT:    v_readlane_b32 s5, v0, 27
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_cmp_lg_u32 s8, 9
+; GCN-NEXT:    v_readlane_b32 s5, v0, 26
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 8
+; GCN-NEXT:    v_readlane_b32 s9, v0, 25
+; GCN-NEXT:    s_cselect_b32 s9, s9, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s5, s9, s5
+; GCN-NEXT:    s_and_b32 s5, s5, 3
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 15
+; GCN-NEXT:    s_lshl_b32 s3, s3, 8
+; GCN-NEXT:    s_or_b32 s2, s2, s3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 7
+; GCN-NEXT:    v_readlane_b32 s3, v0, 24
+; GCN-NEXT:    s_cselect_b32 s3, s3, 1
+; GCN-NEXT:    s_lshl_b32 s3, s3, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 6
+; GCN-NEXT:    v_readlane_b32 s5, v0, 23
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_and_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 2
+; GCN-NEXT:    s_or_b32 s3, s3, s5
+; GCN-NEXT:    s_cmp_lg_u32 s8, 5
+; GCN-NEXT:    v_readlane_b32 s5, v0, 22
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 4
+; GCN-NEXT:    v_readlane_b32 s9, v0, 21
+; GCN-NEXT:    s_cselect_b32 s9, s9, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_or_b32 s5, s9, s5
+; GCN-NEXT:    s_and_b32 s5, s5, 3
+; GCN-NEXT:    s_or_b32 s3, s5, s3
+; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_cmp_lg_u32 s8, 3
+; GCN-NEXT:    v_readlane_b32 s5, v0, 20
+; GCN-NEXT:    s_cselect_b32 s5, s5, 1
+; GCN-NEXT:    s_lshl_b32 s5, s5, 3
+; GCN-NEXT:    s_cmp_lg_u32 s8, 2
+; GCN-NEXT:    v_readlane_b32 s9, v0, 19
+; GCN-NEXT:    s_cselect_b32 s9, s9, 1
+; GCN-NEXT:    s_and_b32 s9, s9, 1
+; GCN-NEXT:    s_lshl_b32 s9, s9, 2
+; GCN-NEXT:    s_or_b32 s5, s5, s9
+; GCN-NEXT:    s_cmp_lg_u32 s8, 0
+; GCN-NEXT:    s_cselect_b32 s4, s4, 1
+; GCN-NEXT:    s_and_b32 s4, s4, 1
+; GCN-NEXT:    s_cmp_lg_u32 s8, 1
+; GCN-NEXT:    v_readlane_b32 s8, v0, 18
+; GCN-NEXT:    s_cselect_b32 s8, s8, 1
+; GCN-NEXT:    s_lshl_b32 s8, s8, 1
+; GCN-NEXT:    s_or_b32 s4, s4, s8
+; GCN-NEXT:    s_and_b32 s4, s4, 3
+; GCN-NEXT:    s_or_b32 s4, s4, s5
+; GCN-NEXT:    s_and_b32 s4, s4, 15
+; GCN-NEXT:    s_or_b32 s3, s4, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0xff
+; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
+; GCN-NEXT:    s_or_b32 s1, s2, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_readlane_b32 s0, v0, 0
+; GCN-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-NEXT:    v_mov_b32_e32 v6, s1
+; GCN-NEXT:    v_mov_b32_e32 v3, s6
+; GCN-NEXT:    v_mov_b32_e32 v4, s7
+; GCN-NEXT:    v_mov_b32_e32 v5, s0
+; GCN-NEXT:    flat_store_dwordx4 v[5:6], v[1:4]
+; GCN-NEXT:    ; kill: killed $vgpr0
 ; GCN-NEXT:    s_endpgm
 entry:
   %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 2a8eac8712e52a..213813a94fc859 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1612,16 +1612,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s5, s[6:7], 0x28
-; VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    v_lshlrev_b16_e32 v0, s4, v0
-; VI-NEXT:    v_not_b32_e32 v1, v0
-; VI-NEXT:    v_and_b32_e32 v1, s5, v1
-; VI-NEXT:    v_and_b32_e32 v0, 0x505, v0
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_and_b32 s6, s4, 0x505
+; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
+; VI-NEXT:    s_and_b32 s4, s4, s5
+; VI-NEXT:    s_or_b32 s4, s6, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
@@ -1871,100 +1871,88 @@ define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s11, 24
 ; VI-NEXT:    s_cmp_lg_u32 s4, 15
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s11, 16
+; VI-NEXT:    s_cselect_b32 s5, s5, 5
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_lshr_b32 s6, s11, 16
 ; VI-NEXT:    s_cmp_lg_u32 s4, 14
-; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s11, 8
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b32 s6, s6, 5
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_or_b32 s5, s6, s5
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s6, s11, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 13
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cselect_b32 s6, s6, 5
+; VI-NEXT:    s_lshl_b32 s6, s6, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 12
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s11
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
-; VI-NEXT:    s_lshr_b32 s5, s10, 24
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b32 s7, s11, 5
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; VI-NEXT:    s_or_b32 s5, s6, s5
+; VI-NEXT:    s_lshr_b32 s6, s10, 24
 ; VI-NEXT:    s_cmp_lg_u32 s4, 11
-; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s10, 16
+; VI-NEXT:    s_cselect_b32 s6, s6, 5
+; VI-NEXT:    s_lshl_b32 s6, s6, 8
+; VI-NEXT:    s_lshr_b32 s7, s10, 16
 ; VI-NEXT:    s_cmp_lg_u32 s4, 10
-; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s10, 8
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b32 s7, s7, 5
+; VI-NEXT:    s_and_b32 s7, s7, 0xff
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_lshr_b32 s7, s10, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 9
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cselect_b32 s7, s7, 5
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 8
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
-; VI-NEXT:    s_lshr_b32 s5, s9, 24
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b32 s10, s10, 5
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_or_b32 s7, s10, s7
+; VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; VI-NEXT:    s_or_b32 s6, s7, s6
+; VI-NEXT:    s_lshr_b32 s7, s9, 24
 ; VI-NEXT:    s_cmp_lg_u32 s4, 7
-; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s9, 16
+; VI-NEXT:    s_cselect_b32 s7, s7, 5
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
+; VI-NEXT:    s_lshr_b32 s10, s9, 16
 ; VI-NEXT:    s_cmp_lg_u32 s4, 6
-; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s9, 8
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b32 s10, s10, 5
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_or_b32 s7, s10, s7
+; VI-NEXT:    s_lshl_b32 s7, s7, 16
+; VI-NEXT:    s_lshr_b32 s10, s9, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 5
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cselect_b32 s10, s10, 5
+; VI-NEXT:    s_lshl_b32 s10, s10, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 4
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, s9
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; VI-NEXT:    s_lshr_b32 s5, s8, 24
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b32 s9, s9, 5
+; VI-NEXT:    s_and_b32 s9, s9, 0xff
+; VI-NEXT:    s_or_b32 s9, s9, s10
+; VI-NEXT:    s_and_b32 s9, s9, 0xffff
+; VI-NEXT:    s_or_b32 s7, s9, s7
+; VI-NEXT:    s_lshr_b32 s9, s8, 24
 ; VI-NEXT:    s_cmp_lg_u32 s4, 3
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s8, 16
+; VI-NEXT:    s_cselect_b32 s9, s9, 5
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_lshr_b32 s10, s8, 16
 ; VI-NEXT:    s_cmp_lg_u32 s4, 2
-; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    s_lshr_b32 s5, s8, 8
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; VI-NEXT:    s_cselect_b32 s10, s10, 5
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_or_b32 s9, s10, s9
+; VI-NEXT:    s_lshl_b32 s9, s9, 16
+; VI-NEXT:    s_lshr_b32 s10, s8, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 1
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cselect_b32 s10, s10, 5
+; VI-NEXT:    s_lshl_b32 s10, s10, 8
 ; VI-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, s8
-; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b32 s4, s8, 5
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_or_b32 s4, s4, s10
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s4, s4, s9
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s7
+; VI-NEXT:    v_mov_b32_e32 v2, s6
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index 2e8049e9765e18..f86c8294ab3c00 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -30,13 +30,12 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX8CHECK:       ; %bb.0:
 ; GFX8CHECK-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX8CHECK-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, 0x7fff
-; GFX8CHECK-NEXT:    s_movk_i32 s2, 0x7f80
 ; GFX8CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
-; GFX8CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s2, v0
+; GFX8CHECK-NEXT:    s_and_b32 s2, s4, 0x7fff
+; GFX8CHECK-NEXT:    s_cmpk_gt_i32 s2, 0x7f80
+; GFX8CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX8CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
 ; GFX8CHECK-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8CHECK-NEXT:    s_endpgm
@@ -45,13 +44,12 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX9CHECK:       ; %bb.0:
 ; GFX9CHECK-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX9CHECK-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9CHECK-NEXT:    v_mov_b32_e32 v1, 0x7fff
-; GFX9CHECK-NEXT:    s_movk_i32 s2, 0x7f80
 ; GFX9CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
-; GFX9CHECK-NEXT:    v_cmp_lt_i16_e32 vcc, s2, v1
-; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9CHECK-NEXT:    s_and_b32 s2, s4, 0x7fff
+; GFX9CHECK-NEXT:    s_cmpk_gt_i32 s2, 0x7f80
+; GFX9CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
 ; GFX9CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9CHECK-NEXT:    s_endpgm
 ;
@@ -60,12 +58,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX10CHECK-NEXT:    s_clause 0x1
 ; GFX10CHECK-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX10CHECK-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX10CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT:    v_and_b32_e64 v0, 0x7fff, s4
-; GFX10CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX10CHECK-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10CHECK-NEXT:    s_and_b32 s2, s4, 0x7fff
+; GFX10CHECK-NEXT:    s_cmpk_gt_i32 s2, 0x7f80
+; GFX10CHECK-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX10CHECK-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10CHECK-NEXT:    s_endpgm
 ;
 ; GFX11CHECK-LABEL: sgpr_isnan_bf16:
@@ -73,12 +72,13 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
 ; GFX11CHECK-NEXT:    s_clause 0x1
 ; GFX11CHECK-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX11CHECK-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
-; GFX11CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT:    v_and_b32_e64 v0, 0x7fff, s4
-; GFX11CHECK-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
-; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11CHECK-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11CHECK-NEXT:    s_and_b32 s2, s4, 0x7fff
+; GFX11CHECK-NEXT:    s_cmpk_gt_i32 s2, 0x7f80
+; GFX11CHECK-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s2
+; GFX11CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11CHECK-NEXT:    s_nop 0
 ; GFX11CHECK-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 7178eaf2e73846..bfe6343e823ce1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -851,6 +851,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX8-LABEL: constant_zextload_v2i1_to_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -858,9 +859,8 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -889,11 +889,10 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -934,9 +933,8 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v2
+; GFX8-NEXT:    v_bfe_i32 v3, v2, 1, 1
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -962,14 +960,14 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v2i1_to_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX12-NEXT:    s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10001
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1008,16 +1006,14 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v1, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, 2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v5
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX8-NEXT:    v_bfe_u32 v1, v1, 1, 1
 ; GFX8-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1049,17 +1045,14 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT:    global_load_u8 v1, v3, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-NEXT:    v_bfe_u32 v1, v1, 1, 1
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1102,11 +1095,9 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 2, 1
+; GFX8-NEXT:    v_bfe_i32 v1, v0, 1, 1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
 ; GFX8-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1138,16 +1129,14 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v3, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v1, v4, 0, 1
+; GFX12-NEXT:    s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10001
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1186,19 +1175,15 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v1, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, 3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 3, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX8-NEXT:    v_bfe_u32 v2, v1, 2, 1
+; GFX8-NEXT:    v_bfe_u32 v1, v1, 1, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1229,21 +1214,14 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT:    global_load_u8 v1, v4, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 3, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT:    v_and_b32_e32 v5, 1, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v5
+; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-NEXT:    v_bfe_u32 v2, v1, 2, 1
+; GFX12-NEXT:    v_bfe_u32 v1, v1, 1, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 3, v3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1286,13 +1264,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 3, v0
+; GFX8-NEXT:    v_bfe_i32 v3, v0, 3, 1
+; GFX8-NEXT:    v_bfe_i32 v2, v0, 2, 1
+; GFX8-NEXT:    v_bfe_i32 v1, v0, 1, 1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1324,19 +1299,17 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v4i1_to_v4i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v5, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_bfe_i32 v3, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_bfe_i32 v1, v5, 0, 1
+; GFX12-NEXT:    s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10001
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1380,33 +1353,32 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v1, v[0:1]
-; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v11, s3
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s1
-; GFX8-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 5, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v12, 3, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 4, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 6, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 2, v1
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 7, v1
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v12
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v13
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x10005
+; GFX8-NEXT:    s_and_b32 s6, s2, 1
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x10004
+; GFX8-NEXT:    s_add_u32 s0, s0, 16
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 7, v0
+; GFX8-NEXT:    v_bfe_u32 v2, v0, 6, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v10, s0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    v_mov_b32_e32 v7, s3
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v8i1_to_v8i32:
@@ -1448,27 +1420,23 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v5, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 4, v0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 6, v0
-; GFX12-NEXT:    v_and_b32_e32 v9, 1, v2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v7, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v10, 1, v5
-; GFX12-NEXT:    v_and_b32_e32 v5, 1, v6
-; GFX12-NEXT:    v_and_b32_e32 v6, 1, v3
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    v_and_b32_e32 v2, 1, v7
-; GFX12-NEXT:    v_and_b32_e32 v7, 0xffff, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v1
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v9
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT:    v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10005
+; GFX12-NEXT:    s_and_b32 s6, s2, 1
+; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10002
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10004
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 7, v0
+; GFX12-NEXT:    v_bfe_u32 v2, v0, 6, 1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6
+; GFX12-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -1511,7 +1479,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 16
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s3
@@ -1519,21 +1487,14 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 5, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 3, v0
-; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
+; GFX8-NEXT:    v_bfe_i32 v3, v4, 3, 1
+; GFX8-NEXT:    v_bfe_i32 v2, v4, 2, 1
+; GFX8-NEXT:    v_bfe_i32 v1, v4, 1, 1
+; GFX8-NEXT:    v_bfe_i32 v0, v4, 0, 1
+; GFX8-NEXT:    v_bfe_i32 v7, v4, 7, 1
+; GFX8-NEXT:    v_bfe_i32 v6, v4, 6, 1
+; GFX8-NEXT:    v_bfe_i32 v5, v4, 5, 1
+; GFX8-NEXT:    v_bfe_i32 v4, v4, 4, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -1579,28 +1540,26 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 4, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v5, 6, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v7, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v9, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v3, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v6, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v7, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v5, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v1, v9, 0, 1
+; GFX12-NEXT:    s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10001
+; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s7, s2, 0x10007
+; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10006
+; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10004
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10005
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s3
+; GFX12-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -1653,62 +1612,59 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ushort v1, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_and_b32 s6, 0xffff, s2
+; GFX8-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x1000d
+; GFX8-NEXT:    s_and_b32 s9, s2, 1
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s11, s6, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x1000b
+; GFX8-NEXT:    s_lshr_b32 s13, s6, 15
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x1000e
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v15, s3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX8-NEXT:    v_mov_b32_e32 v18, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_mov_b32_e32 v0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    v_mov_b32_e32 v6, s10
+; GFX8-NEXT:    v_mov_b32_e32 v4, s17
+; GFX8-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, s1
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
-; GFX8-NEXT:    v_mov_b32_e32 v22, s0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 12, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 3, v1
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 7, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 14, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v11, 2, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 13, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 9, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 10, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 4, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v1
-; GFX8-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff, v12
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff, v3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 11, v1
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v11, 15, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 5, v1
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v24
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v11, s5
+; GFX8-NEXT:    v_mov_b32_e32 v8, s16
+; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v10, s15
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v12, s9
+; GFX8-NEXT:    v_mov_b32_e32 v13, s4
+; GFX8-NEXT:    v_mov_b32_e32 v14, s14
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1772,49 +1728,39 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v16, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 13, v0
-; GFX12-NEXT:    v_lshrrev_b16 v13, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v15, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 9, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 11, v0
-; GFX12-NEXT:    v_and_b32_e32 v17, 1, v2
-; GFX12-NEXT:    v_lshrrev_b16 v10, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v12, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 12, v0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 14, v0
-; GFX12-NEXT:    v_lshrrev_b16 v5, 15, v0
-; GFX12-NEXT:    v_lshrrev_b16 v14, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v22, 1, v13
-; GFX12-NEXT:    v_and_b32_e32 v13, 1, v15
-; GFX12-NEXT:    v_lshrrev_b16 v7, 8, v0
-; GFX12-NEXT:    v_lshrrev_b16 v8, 10, v0
-; GFX12-NEXT:    v_lshrrev_b16 v9, 4, v0
-; GFX12-NEXT:    v_lshrrev_b16 v11, 6, v0
-; GFX12-NEXT:    v_and_b32_e32 v18, 1, v4
-; GFX12-NEXT:    v_and_b32_e32 v19, 1, v6
-; GFX12-NEXT:    v_and_b32_e32 v20, 1, v10
-; GFX12-NEXT:    v_and_b32_e32 v21, 1, v12
-; GFX12-NEXT:    v_and_b32_e32 v2, 1, v14
-; GFX12-NEXT:    v_and_b32_e32 v15, 0xffff, v5
-; GFX12-NEXT:    v_and_b32_e32 v14, 1, v3
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v1
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v13, 0xffff, v17
-; GFX12-NEXT:    v_and_b32_e32 v6, 1, v11
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v9
-; GFX12-NEXT:    v_and_b32_e32 v10, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v7
-; GFX12-NEXT:    v_and_b32_e32 v11, 0xffff, v19
-; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GFX12-NEXT:    v_and_b32_e32 v7, 0xffff, v21
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v20
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    s_and_b32 s6, 0xffff, s2
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x1000d
+; GFX12-NEXT:    s_and_b32 s9, s2, 1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s8
+; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_u32 s11, s6, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s12, s6, 0x1000b
+; GFX12-NEXT:    s_lshr_b32 s13, s6, 15
+; GFX12-NEXT:    s_bfe_u32 s14, s6, 0x10002
+; GFX12-NEXT:    s_bfe_u32 s15, s6, 0x10006
+; GFX12-NEXT:    s_bfe_u32 s16, s6, 0x10004
+; GFX12-NEXT:    s_bfe_u32 s17, s6, 0x10008
+; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x1000e
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT:    v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v2, s6
+; GFX12-NEXT:    v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v4, s17
+; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v11, s5
+; GFX12-NEXT:    v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s16
+; GFX12-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v10, s15
+; GFX12-NEXT:    v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s4
+; GFX12-NEXT:    v_mov_b32_e32 v14, s14
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -1867,7 +1813,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v12, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s3
@@ -1883,37 +1829,22 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v20, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v22, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v12, 12, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v13, 13, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 14, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v15, 15, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 8, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 9, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 10, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v11, 11, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 5, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 3, v0
-; GFX8-NEXT:    v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v8, v8, 0, 1
+; GFX8-NEXT:    v_bfe_i32 v3, v12, 3, 1
+; GFX8-NEXT:    v_bfe_i32 v2, v12, 2, 1
+; GFX8-NEXT:    v_bfe_i32 v1, v12, 1, 1
+; GFX8-NEXT:    v_bfe_i32 v0, v12, 0, 1
+; GFX8-NEXT:    v_bfe_i32 v7, v12, 7, 1
+; GFX8-NEXT:    v_bfe_i32 v6, v12, 6, 1
+; GFX8-NEXT:    v_bfe_i32 v5, v12, 5, 1
+; GFX8-NEXT:    v_bfe_i32 v4, v12, 4, 1
+; GFX8-NEXT:    v_bfe_i32 v11, v12, 11, 1
+; GFX8-NEXT:    v_bfe_i32 v10, v12, 10, 1
+; GFX8-NEXT:    v_bfe_i32 v9, v12, 9, 1
+; GFX8-NEXT:    v_bfe_i32 v8, v12, 8, 1
+; GFX8-NEXT:    v_bfe_i32 v15, v12, 15, 1
+; GFX8-NEXT:    v_bfe_i32 v14, v12, 14, 1
+; GFX8-NEXT:    v_bfe_i32 v13, v12, 13, 1
+; GFX8-NEXT:    v_bfe_i32 v12, v12, 12, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
 ; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
 ; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
@@ -1991,46 +1922,40 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v16, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 12, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 13, v0
-; GFX12-NEXT:    v_lshrrev_b16 v8, 14, v0
-; GFX12-NEXT:    v_lshrrev_b16 v12, 15, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v7, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v13, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v17, 8, v0
-; GFX12-NEXT:    v_lshrrev_b16 v9, 9, v0
-; GFX12-NEXT:    v_lshrrev_b16 v10, 10, v0
-; GFX12-NEXT:    v_lshrrev_b16 v11, 11, v0
-; GFX12-NEXT:    v_lshrrev_b16 v18, 4, v0
-; GFX12-NEXT:    v_lshrrev_b16 v5, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 6, v0
-; GFX12-NEXT:    v_lshrrev_b16 v19, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v3, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v7, v13, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v15, v12, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v14, v8, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v13, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v12, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v17, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v18, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v1, v19, 0, 1
+; GFX12-NEXT:    s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10001
+; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s7, s2, 0x10007
+; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10006
+; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10005
+; GFX12-NEXT:    s_bfe_i32 s10, s2, 0x10004
+; GFX12-NEXT:    s_bfe_i32 s11, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_i32 s12, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_i32 s13, s2, 0x10009
+; GFX12-NEXT:    s_bfe_i32 s14, s2, 0x10008
+; GFX12-NEXT:    s_bfe_i32 s15, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_i32 s16, s2, 0x1000e
+; GFX12-NEXT:    s_bfe_i32 s17, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x1000d
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT:    v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT:    v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v9, s9
+; GFX12-NEXT:    v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s5
+; GFX12-NEXT:    v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s3
+; GFX12-NEXT:    v_mov_b32_e32 v14, s4
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -2134,118 +2059,108 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s4
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s4
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 7, s4
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 1, s4
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 9, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 11, s4
-; GFX8-NEXT:    v_and_b32_e32 v26, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 4, s2
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 3, s2
-; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x10018
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 7, s2
-; GFX8-NEXT:    s_and_b32 s6, s4, 1
-; GFX8-NEXT:    s_bfe_u32 s7, s4, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s8, s4, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s9, s4, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s10, s4, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s2, s4, 0x10017
-; GFX8-NEXT:    s_bfe_u32 s3, s4, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s11, s4, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s12, s4, 0x10014
-; GFX8-NEXT:    v_mov_b32_e32 v11, s2
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s8, s2, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x10019
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 31
+; GFX8-NEXT:    s_bfe_u32 s17, s2, 0x1001d
+; GFX8-NEXT:    s_and_b32 s18, s2, 1
+; GFX8-NEXT:    s_bfe_u32 s19, s2, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s20, s2, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s21, s2, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s22, s2, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s23, s2, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s24, s2, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s25, s2, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s26, s2, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s27, s2, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s28, s2, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s29, s2, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s30, s2, 0x10014
+; GFX8-NEXT:    s_bfe_u32 s31, s2, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s33, s2, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s34, s2, 0x1001e
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x1001c
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mov_b32_e32 v2, s34
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x50
-; GFX8-NEXT:    v_mov_b32_e32 v10, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s33
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s31
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
+; GFX8-NEXT:    v_mov_b32_e32 v0, s30
+; GFX8-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NEXT:    v_mov_b32_e32 v2, s28
+; GFX8-NEXT:    v_mov_b32_e32 v3, s14
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 14, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v25, 2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    v_mov_b32_e32 v9, s9
-; GFX8-NEXT:    v_mov_b32_e32 v10, s8
-; GFX8-NEXT:    v_mov_b32_e32 v11, s7
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v22
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v25
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v21
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v24
-; GFX8-NEXT:    v_mov_b32_e32 v25, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 12, s4
-; GFX8-NEXT:    v_mov_b32_e32 v24, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v23
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 15, s4
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX8-NEXT:    v_mov_b32_e32 v0, s25
+; GFX8-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NEXT:    v_mov_b32_e32 v2, s24
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 10, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 4, s4
-; GFX8-NEXT:    v_mov_b32_e32 v8, 1
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT:    v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 6, s4
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX8-NEXT:    v_mov_b32_e32 v17, s3
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT:    v_mov_b32_e32 v16, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
-; GFX8-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v26
-; GFX8-NEXT:    v_mov_b32_e32 v8, s6
-; GFX8-NEXT:    v_mov_b32_e32 v12, s0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s23
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT:    s_add_u32 s0, s0, 0x60
-; GFX8-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v8, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s21
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -2353,82 +2268,65 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX12-NEXT:    v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT:    v_lshrrev_b16 v6, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v9, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT:    v_lshrrev_b16 v14, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 1, s3
-; GFX12-NEXT:    v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT:    v_lshrrev_b16 v10, 1, s2
-; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
-; GFX12-NEXT:    v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v20, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 10, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v11, 2, s2
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT:    s_and_b32 s5, s2, 1
-; GFX12-NEXT:    v_lshrrev_b16 v15, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v16, 6, s3
-; GFX12-NEXT:    v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT:    v_lshrrev_b16 v19, 2, s3
-; GFX12-NEXT:    v_and_b32_e32 v25, 1, v14
-; GFX12-NEXT:    v_and_b32_e32 v26, 1, v18
-; GFX12-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10013
-; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10012
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
-; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10011
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10010
-; GFX12-NEXT:    s_bfe_u32 s9, s2, 0x10017
-; GFX12-NEXT:    v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
-; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x10016
-; GFX12-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX12-NEXT:    s_bfe_u32 s11, s2, 0x10014
-; GFX12-NEXT:    v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10015
-; GFX12-NEXT:    v_and_b32_e32 v22, 1, v2
-; GFX12-NEXT:    v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
-; GFX12-NEXT:    v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
-; GFX12-NEXT:    v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v5
-; GFX12-NEXT:    v_and_b32_e32 v10, 1, v3
-; GFX12-NEXT:    v_and_b32_e32 v14, 1, v19
-; GFX12-NEXT:    v_and_b32_e32 v19, 0xffff, v17
-; GFX12-NEXT:    v_and_b32_e32 v18, 1, v16
-; GFX12-NEXT:    v_and_b32_e32 v16, 1, v15
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v24
-; GFX12-NEXT:    v_dual_mov_b32 v24, s11 :: v_dual_and_b32 v13, 0xffff, v26
-; GFX12-NEXT:    v_and_b32_e32 v11, 0xffff, v23
-; GFX12-NEXT:    v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
-; GFX12-NEXT:    v_and_b32_e32 v7, 0xffff, v9
-; GFX12-NEXT:    v_and_b32_e32 v20, 1, v0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
-; GFX12-NEXT:    v_mov_b32_e32 v25, s2
-; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff, v22
-; GFX12-NEXT:    v_and_b32_e32 v22, 1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
-; GFX12-NEXT:    v_and_b32_e32 v21, 0xffff, v33
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    s_clause 0x7
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s9, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s11, s2, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s12, s2, 0x10011
+; GFX12-NEXT:    s_bfe_u32 s13, s2, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s14, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s15, s2, 0x10019
+; GFX12-NEXT:    s_lshr_b32 s16, s2, 31
+; GFX12-NEXT:    s_bfe_u32 s17, s2, 0x1001d
+; GFX12-NEXT:    s_and_b32 s18, s2, 1
+; GFX12-NEXT:    s_bfe_u32 s19, s2, 0x10002
+; GFX12-NEXT:    s_bfe_u32 s20, s2, 0x10006
+; GFX12-NEXT:    s_bfe_u32 s21, s2, 0x10004
+; GFX12-NEXT:    s_bfe_u32 s22, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_u32 s23, s2, 0x10008
+; GFX12-NEXT:    s_bfe_u32 s24, s2, 0x1000e
+; GFX12-NEXT:    s_bfe_u32 s25, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_u32 s26, s2, 0x10012
+; GFX12-NEXT:    s_bfe_u32 s27, s2, 0x10010
+; GFX12-NEXT:    s_bfe_u32 s28, s2, 0x10016
+; GFX12-NEXT:    s_bfe_u32 s29, s2, 0x10015
+; GFX12-NEXT:    s_bfe_u32 s30, s2, 0x10014
+; GFX12-NEXT:    s_bfe_u32 s31, s2, 0x1001a
+; GFX12-NEXT:    s_bfe_u32 s33, s2, 0x10018
+; GFX12-NEXT:    s_bfe_u32 s34, s2, 0x1001c
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x1001e
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s17
+; GFX12-NEXT:    v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT:    v_dual_mov_b32 v4, s33 :: v_dual_mov_b32 v7, s14
+; GFX12-NEXT:    v_dual_mov_b32 v6, s31 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT:    v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s13
+; GFX12-NEXT:    v_mov_b32_e32 v10, s28
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s12 :: v_dual_mov_b32 v0, s27
+; GFX12-NEXT:    v_dual_mov_b32 v3, s11 :: v_dual_mov_b32 v2, s26
+; GFX12-NEXT:    v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s25
+; GFX12-NEXT:    v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s24
+; GFX12-NEXT:    v_dual_mov_b32 v13, s8 :: v_dual_mov_b32 v12, s23
+; GFX12-NEXT:    v_dual_mov_b32 v15, s7 :: v_dual_mov_b32 v14, s22
+; GFX12-NEXT:    v_dual_mov_b32 v17, s6 :: v_dual_mov_b32 v16, s21
+; GFX12-NEXT:    v_dual_mov_b32 v19, s5 :: v_dual_mov_b32 v18, s20
+; GFX12-NEXT:    v_dual_mov_b32 v21, s4 :: v_dual_mov_b32 v20, s18
+; GFX12-NEXT:    v_dual_mov_b32 v23, s3 :: v_dual_mov_b32 v22, s19
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -2534,107 +2432,106 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 13, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 14, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 15, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 8, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 9, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 10, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 11, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 4, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 5, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 7, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 3, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 4, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 5, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 6, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 7, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v25, 1, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 2, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 3, s3
-; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x10000
-; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x10013
-; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x10012
-; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x10011
-; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x10010
-; GFX8-NEXT:    s_bfe_i32 s3, s2, 0x10017
-; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x10016
-; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x10015
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x10014
+; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x10003
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x10002
+; GFX8-NEXT:    s_bfe_i32 s6, s2, 0x10001
+; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x10000
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x10007
+; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x10006
+; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x10005
+; GFX8-NEXT:    s_bfe_i32 s11, s2, 0x10004
+; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x1000b
+; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x1000a
+; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x10009
+; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x10008
+; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x1000f
+; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x1000e
+; GFX8-NEXT:    s_bfe_i32 s18, s2, 0x1000d
+; GFX8-NEXT:    s_bfe_i32 s19, s2, 0x1000c
+; GFX8-NEXT:    s_bfe_i32 s20, s2, 0x10013
+; GFX8-NEXT:    s_bfe_i32 s21, s2, 0x10012
+; GFX8-NEXT:    s_bfe_i32 s22, s2, 0x10011
+; GFX8-NEXT:    s_bfe_i32 s23, s2, 0x10010
+; GFX8-NEXT:    s_bfe_i32 s24, s2, 0x10017
+; GFX8-NEXT:    s_bfe_i32 s25, s2, 0x10016
+; GFX8-NEXT:    s_bfe_i32 s26, s2, 0x10015
+; GFX8-NEXT:    s_bfe_i32 s27, s2, 0x10014
+; GFX8-NEXT:    s_bfe_i32 s28, s2, 0x1001b
+; GFX8-NEXT:    s_bfe_i32 s29, s2, 0x1001a
+; GFX8-NEXT:    s_bfe_i32 s30, s2, 0x10019
+; GFX8-NEXT:    s_bfe_i32 s31, s2, 0x10018
+; GFX8-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX8-NEXT:    s_bfe_i32 s33, s2, 0x1001e
+; GFX8-NEXT:    s_bfe_i32 s34, s2, 0x1001d
+; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x1001c
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s34
+; GFX8-NEXT:    v_mov_b32_e32 v2, s33
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NEXT:    v_mov_b32_e32 v1, s30
+; GFX8-NEXT:    v_mov_b32_e32 v2, s29
+; GFX8-NEXT:    v_mov_b32_e32 v3, s28
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v1, s11
-; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    v_mov_b32_e32 v1, s26
+; GFX8-NEXT:    v_mov_b32_e32 v2, s25
+; GFX8-NEXT:    v_mov_b32_e32 v3, s24
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NEXT:    v_mov_b32_e32 v0, s9
-; GFX8-NEXT:    v_mov_b32_e32 v1, s8
-; GFX8-NEXT:    v_mov_b32_e32 v2, s7
-; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s23
+; GFX8-NEXT:    v_mov_b32_e32 v1, s22
+; GFX8-NEXT:    v_mov_b32_e32 v2, s21
+; GFX8-NEXT:    v_mov_b32_e32 v3, s20
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    v_bfe_i32 v5, v24, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v25, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v25, s3
-; GFX8-NEXT:    v_mov_b32_e32 v24, s2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_bfe_i32 v4, v23, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v22, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v22, v21, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v21, v20, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v20, v8, 0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s19
+; GFX8-NEXT:    v_mov_b32_e32 v1, s18
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s16
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    v_bfe_i32 v18, v18, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v17, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v16, v16, 0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s15
+; GFX8-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GFX8-NEXT:    v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v17, s3
-; GFX8-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v16, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
-; GFX8-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT:    s_add_u32 s0, s0, 0x60
-; GFX8-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NEXT:    v_bfe_i32 v7, v7, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v8, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT:    v_bfe_i32 v3, v27, 0, 1
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NEXT:    v_mov_b32_e32 v1, s10
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_bfe_i32 v2, v26, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -2774,79 +2671,66 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v13, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v14, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v9, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v10, 10, s2
-; GFX12-NEXT:    v_lshrrev_b16 v11, 11, s2
-; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX12-NEXT:    v_lshrrev_b16 v4, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v6, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, s2
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 3, s2
-; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10000
-; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x10013
-; GFX12-NEXT:    s_bfe_i32 s7, s2, 0x10012
-; GFX12-NEXT:    v_lshrrev_b16 v16, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v20, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v21, 6, s3
-; GFX12-NEXT:    v_lshrrev_b16 v22, 7, s3
-; GFX12-NEXT:    v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT:    v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10011
-; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10010
-; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10017
-; GFX12-NEXT:    s_bfe_i32 s10, s2, 0x10016
-; GFX12-NEXT:    s_bfe_i32 s11, s2, 0x10014
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10015
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10001
+; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s7, s2, 0x10007
+; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10006
+; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10005
+; GFX12-NEXT:    s_bfe_i32 s10, s2, 0x10004
+; GFX12-NEXT:    s_bfe_i32 s11, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_i32 s12, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_i32 s13, s2, 0x10009
+; GFX12-NEXT:    s_bfe_i32 s14, s2, 0x10008
+; GFX12-NEXT:    s_bfe_i32 s15, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_i32 s16, s2, 0x1000e
+; GFX12-NEXT:    s_bfe_i32 s17, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_i32 s18, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_i32 s19, s2, 0x10013
+; GFX12-NEXT:    s_bfe_i32 s20, s2, 0x10012
+; GFX12-NEXT:    s_bfe_i32 s21, s2, 0x10011
+; GFX12-NEXT:    s_bfe_i32 s22, s2, 0x10010
+; GFX12-NEXT:    s_bfe_i32 s23, s2, 0x10017
+; GFX12-NEXT:    s_bfe_i32 s24, s2, 0x10016
+; GFX12-NEXT:    s_bfe_i32 s25, s2, 0x10015
+; GFX12-NEXT:    s_bfe_i32 s26, s2, 0x10014
+; GFX12-NEXT:    s_bfe_i32 s27, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_i32 s28, s2, 0x1001a
+; GFX12-NEXT:    s_bfe_i32 s29, s2, 0x10019
+; GFX12-NEXT:    s_bfe_i32 s30, s2, 0x10018
+; GFX12-NEXT:    s_ashr_i32 s31, s2, 31
+; GFX12-NEXT:    s_bfe_i32 s33, s2, 0x1001e
+; GFX12-NEXT:    s_bfe_i32 s34, s2, 0x1001c
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x1001d
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
-; GFX12-NEXT:    v_bfe_i32 v15, v14, 0, 1
-; GFX12-NEXT:    v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
-; GFX12-NEXT:    v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3
-; GFX12-NEXT:    v_bfe_i32 v14, v13, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v13, v12, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v12, v0, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v8, 0, 1
-; GFX12-NEXT:    v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v31, s6
-; GFX12-NEXT:    v_mov_b32_e32 v30, s7
-; GFX12-NEXT:    v_bfe_i32 v7, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT:    v_mov_b32_e32 v0, s5
-; GFX12-NEXT:    v_bfe_i32 v23, v22, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v22, v21, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v21, v20, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v20, v16, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v18, v18, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v17, v17, 0, 1
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s31
+; GFX12-NEXT:    v_dual_mov_b32 v2, s33 :: v_dual_mov_b32 v5, s29
+; GFX12-NEXT:    v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s27
+; GFX12-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT:    v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v11, s23
+; GFX12-NEXT:    v_mov_b32_e32 v10, s24
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT:    v_mov_b32_e32 v16, s4
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v0, s22
+; GFX12-NEXT:    v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s20
+; GFX12-NEXT:    v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v4, s18
+; GFX12-NEXT:    v_dual_mov_b32 v7, s15 :: v_dual_mov_b32 v6, s16
+; GFX12-NEXT:    v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v12, s14
+; GFX12-NEXT:    v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v14, s12
+; GFX12-NEXT:    v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v16, s10
+; GFX12-NEXT:    v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s8
+; GFX12-NEXT:    v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v20, s6
+; GFX12-NEXT:    v_dual_mov_b32 v23, s3 :: v_dual_mov_b32 v22, s4
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -3027,233 +2911,218 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[26:27], s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s6, s3, 24
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x10018
-; GFX8-NEXT:    s_bfe_u32 s5, s3, 0x10018
-; GFX8-NEXT:    s_and_b32 s7, s3, 1
-; GFX8-NEXT:    s_and_b32 s9, s2, 1
-; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x10017
-; GFX8-NEXT:    s_bfe_u32 s17, s2, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s18, s2, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT:    s_bfe_u32 s20, s3, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s21, s3, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s22, s3, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s23, s3, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s10, s3, 0x10017
-; GFX8-NEXT:    s_bfe_u32 s11, s3, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s24, s3, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s25, s3, 0x10014
-; GFX8-NEXT:    v_mov_b32_e32 v25, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0xd0
-; GFX8-NEXT:    v_mov_b32_e32 v24, s11
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0xc0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s25
-; GFX8-NEXT:    v_mov_b32_e32 v23, s24
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x50
-; GFX8-NEXT:    v_mov_b32_e32 v22, s23
-; GFX8-NEXT:    v_mov_b32_e32 v23, s22
-; GFX8-NEXT:    v_mov_b32_e32 v24, s21
-; GFX8-NEXT:    v_mov_b32_e32 v25, s20
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v22, s19
-; GFX8-NEXT:    v_mov_b32_e32 v23, s18
-; GFX8-NEXT:    v_mov_b32_e32 v24, s17
-; GFX8-NEXT:    v_mov_b32_e32 v25, s16
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 48
-; GFX8-NEXT:    v_mov_b32_e32 v22, s15
-; GFX8-NEXT:    v_mov_b32_e32 v23, s14
-; GFX8-NEXT:    v_mov_b32_e32 v24, s13
-; GFX8-NEXT:    v_mov_b32_e32 v25, s12
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 13, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT:    v_mov_b32_e32 v25, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT:    v_and_b32_e32 v21, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v27, 1, v22
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v24, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 32
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s2
-; GFX8-NEXT:    v_and_b32_e32 v28, 1, v22
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 15, s2
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v19
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v14, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 5, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 3, s3
-; GFX8-NEXT:    v_mov_b32_e32 v25, 1
-; GFX8-NEXT:    v_mov_b32_e32 v21, s11
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 7, s2
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v16
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT:    v_and_b32_sdwa v16, v14, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v20, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 5, s6
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v14
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 1, s6
-; GFX8-NEXT:    s_add_u32 s10, s0, 16
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v15
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 3, s6
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v15
-; GFX8-NEXT:    v_mov_b32_e32 v16, s11
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX8-NEXT:    v_mov_b32_e32 v15, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 13, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 5, s8
-; GFX8-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v8
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT:    s_add_u32 s10, s0, 0xb0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 1, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 12, s3
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 14, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 9, s3
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 3, s8
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v9, s10
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 15, s3
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v10, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 10, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 11, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 2, s6
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 4, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 6, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[5:8]
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xa0
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff, v13
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v17
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 4, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 6, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 2, s3
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v16
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v18
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT:    v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v15
-; GFX8-NEXT:    v_and_b32_e32 v15, 0xffff, v19
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v4
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x90
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v1
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v20
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v0
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 6, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x80
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff, v23
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v22
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff, v27
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v26
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v28
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xf0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 4, s6
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[1:4]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 7, s6
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v24
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 2, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v12, s5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 7, s8
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s0, s0, 0x60
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    s_bfe_u32 s2, s26, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s3, s26, 0x10001
+; GFX8-NEXT:    s_bfe_u32 s4, s26, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s5, s26, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s6, s26, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s9, s26, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s11, s26, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s13, s26, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s15, s26, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s17, s26, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s19, s26, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s21, s26, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s23, s26, 0x10019
+; GFX8-NEXT:    s_lshr_b32 s25, s26, 31
+; GFX8-NEXT:    s_bfe_u32 s28, s26, 0x1001d
+; GFX8-NEXT:    s_bfe_u32 s29, s27, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s30, s27, 0x10001
+; GFX8-NEXT:    s_bfe_u32 s31, s27, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s33, s27, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s34, s27, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s35, s27, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s36, s27, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s37, s27, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s38, s27, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s39, s27, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s40, s27, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s41, s27, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s42, s27, 0x10019
+; GFX8-NEXT:    s_lshr_b32 s43, s27, 31
+; GFX8-NEXT:    s_bfe_u32 s44, s27, 0x1001d
+; GFX8-NEXT:    s_and_b32 s8, s26, 1
+; GFX8-NEXT:    s_bfe_u32 s7, s26, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s10, s26, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s12, s26, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s14, s26, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s16, s26, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s18, s26, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s20, s26, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s22, s26, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s24, s26, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s45, s26, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s46, s26, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s47, s26, 0x10014
+; GFX8-NEXT:    s_bfe_u32 s48, s26, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s49, s26, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s50, s26, 0x1001e
+; GFX8-NEXT:    s_bfe_u32 s51, s26, 0x1001c
+; GFX8-NEXT:    s_and_b32 s52, s27, 1
+; GFX8-NEXT:    s_bfe_u32 s53, s27, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s54, s27, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s55, s27, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s56, s27, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s57, s27, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s58, s27, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s59, s27, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s60, s27, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s61, s27, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s62, s27, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s63, s27, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s64, s27, 0x10014
+; GFX8-NEXT:    s_bfe_u32 s65, s27, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s66, s27, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s26, s27, 0x1001e
+; GFX8-NEXT:    s_bfe_u32 s27, s27, 0x1001c
+; GFX8-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xf0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v1, s44
+; GFX8-NEXT:    v_mov_b32_e32 v3, s43
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xe0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s66
+; GFX8-NEXT:    v_mov_b32_e32 v1, s42
+; GFX8-NEXT:    v_mov_b32_e32 v2, s65
+; GFX8-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xd0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NEXT:    v_mov_b32_e32 v1, s63
+; GFX8-NEXT:    v_mov_b32_e32 v2, s62
+; GFX8-NEXT:    v_mov_b32_e32 v3, s40
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xc0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s61
+; GFX8-NEXT:    v_mov_b32_e32 v1, s39
+; GFX8-NEXT:    v_mov_b32_e32 v2, s60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s38
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xb0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s59
+; GFX8-NEXT:    v_mov_b32_e32 v1, s37
+; GFX8-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NEXT:    v_mov_b32_e32 v3, s36
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xa0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s57
+; GFX8-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NEXT:    v_mov_b32_e32 v2, s56
+; GFX8-NEXT:    v_mov_b32_e32 v3, s34
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x90
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s55
+; GFX8-NEXT:    v_mov_b32_e32 v1, s33
+; GFX8-NEXT:    v_mov_b32_e32 v2, s54
+; GFX8-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x80
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s52
+; GFX8-NEXT:    v_mov_b32_e32 v1, s30
+; GFX8-NEXT:    v_mov_b32_e32 v2, s53
+; GFX8-NEXT:    v_mov_b32_e32 v3, s29
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x70
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s51
+; GFX8-NEXT:    v_mov_b32_e32 v1, s28
+; GFX8-NEXT:    v_mov_b32_e32 v2, s50
+; GFX8-NEXT:    v_mov_b32_e32 v3, s25
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x60
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s49
+; GFX8-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NEXT:    v_mov_b32_e32 v2, s48
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x50
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s47
+; GFX8-NEXT:    v_mov_b32_e32 v1, s46
+; GFX8-NEXT:    v_mov_b32_e32 v2, s45
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NEXT:    s_add_u32 s22, s0, 64
+; GFX8-NEXT:    s_addc_u32 s23, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    s_add_u32 s18, s0, 48
+; GFX8-NEXT:    s_addc_u32 s19, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v0, s20
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mov_b32_e32 v5, s19
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    s_add_u32 s14, s0, 32
+; GFX8-NEXT:    s_addc_u32 s15, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s14
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    s_add_u32 s4, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v64i1_to_v64i32:
@@ -3448,160 +3317,123 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX12-NEXT:    v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT:    v_lshrrev_b16 v9, 13, s3
-; GFX12-NEXT:    v_and_b32_e32 v44, 1, v1
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, s4
-; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX12-NEXT:    v_dual_mov_b32 v64, 0 :: v_dual_and_b32 v41, 1, v2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v6, 1, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 3, s2
-; GFX12-NEXT:    v_lshrrev_b16 v10, 9, s3
-; GFX12-NEXT:    v_lshrrev_b16 v11, 11, s3
-; GFX12-NEXT:    v_lshrrev_b16 v12, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v13, 7, s3
-; GFX12-NEXT:    v_lshrrev_b16 v14, 1, s3
-; GFX12-NEXT:    v_lshrrev_b16 v17, 5, s4
-; GFX12-NEXT:    v_lshrrev_b16 v2, 5, s5
-; GFX12-NEXT:    s_and_b32 s7, s2, 1
-; GFX12-NEXT:    s_bfe_u32 s18, s3, 0x10010
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v52, s18 :: v_dual_and_b32 v35, 1, v9
-; GFX12-NEXT:    v_and_b32_e32 v9, 1, v1
-; GFX12-NEXT:    v_lshrrev_b16 v1, 3, s4
-; GFX12-NEXT:    s_bfe_u32 s19, s3, 0x10017
-; GFX12-NEXT:    v_dual_mov_b32 v51, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT:    v_lshrrev_b16 v3, 3, s5
-; GFX12-NEXT:    v_lshrrev_b16 v15, 3, s3
-; GFX12-NEXT:    v_lshrrev_b16 v28, 12, s3
-; GFX12-NEXT:    v_lshrrev_b16 v29, 14, s3
-; GFX12-NEXT:    v_lshrrev_b16 v30, 15, s3
-; GFX12-NEXT:    v_lshrrev_b16 v25, 10, s3
-; GFX12-NEXT:    v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v21, 6, s3
-; GFX12-NEXT:    v_and_b32_e32 v27, 1, v12
-; GFX12-NEXT:    s_and_b32 s6, s3, 1
-; GFX12-NEXT:    s_bfe_u32 s9, s2, 0x10012
-; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x10011
-; GFX12-NEXT:    s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v59, s12 :: v_dual_and_b32 v22, 1, v13
-; GFX12-NEXT:    v_dual_mov_b32 v62, s9 :: v_dual_and_b32 v13, 1, v17
-; GFX12-NEXT:    v_lshrrev_b16 v17, 6, s5
-; GFX12-NEXT:    s_bfe_u32 s13, s2, 0x10016
-; GFX12-NEXT:    v_dual_mov_b32 v58, s13 :: v_dual_and_b32 v23, 1, v14
-; GFX12-NEXT:    s_bfe_u32 s14, s2, 0x10015
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v57, s14 :: v_dual_and_b32 v26, 1, v11
-; GFX12-NEXT:    v_and_b32_e32 v11, 1, v1
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, s5
-; GFX12-NEXT:    s_bfe_u32 s15, s3, 0x10013
-; GFX12-NEXT:    v_dual_mov_b32 v55, s15 :: v_dual_and_b32 v34, 1, v7
-; GFX12-NEXT:    v_lshrrev_b16 v7, 7, s5
-; GFX12-NEXT:    s_bfe_u32 s16, s3, 0x10012
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v54, s16 :: v_dual_and_b32 v31, 1, v10
-; GFX12-NEXT:    s_bfe_u32 s17, s3, 0x10011
-; GFX12-NEXT:    v_dual_mov_b32 v53, s17 :: v_dual_and_b32 v38, 1, v5
-; GFX12-NEXT:    s_bfe_u32 s20, s3, 0x10016
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v50, s20 :: v_dual_and_b32 v39, 1, v6
-; GFX12-NEXT:    v_lshrrev_b16 v6, 2, s5
-; GFX12-NEXT:    s_bfe_u32 s21, s3, 0x10014
-; GFX12-NEXT:    v_dual_mov_b32 v48, s21 :: v_dual_and_b32 v43, 1, v4
-; GFX12-NEXT:    v_lshrrev_b16 v4, 4, s5
-; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT:    v_and_b32_e32 v5, 1, v2
-; GFX12-NEXT:    v_dual_mov_b32 v61, s10 :: v_dual_and_b32 v2, 1, v3
-; GFX12-NEXT:    v_lshrrev_b16 v16, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v24, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT:    s_bfe_u32 s11, s2, 0x10010
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v60, s11 :: v_dual_and_b32 v19, 1, v15
-; GFX12-NEXT:    v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v8, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 4, s4
-; GFX12-NEXT:    v_lshrrev_b16 v14, 6, s4
-; GFX12-NEXT:    v_lshrrev_b16 v15, 7, s4
-; GFX12-NEXT:    v_lshrrev_b16 v32, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v40, 10, s2
-; GFX12-NEXT:    v_lshrrev_b16 v36, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v37, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v33, 2, s2
-; GFX12-NEXT:    v_lshrrev_b16 v10, 2, s4
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10014
-; GFX12-NEXT:    s_bfe_u32 s3, s3, 0x10015
-; GFX12-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v2
-; GFX12-NEXT:    v_dual_mov_b32 v63, s8 :: v_dual_and_b32 v2, 1, v6
-; GFX12-NEXT:    v_and_b32_e32 v6, 1, v17
-; GFX12-NEXT:    v_and_b32_e32 v17, 0xffff, v23
-; GFX12-NEXT:    v_and_b32_e32 v23, 0xffff, v22
-; GFX12-NEXT:    v_and_b32_e32 v22, 1, v21
-; GFX12-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX12-NEXT:    v_dual_mov_b32 v49, s3 :: v_dual_and_b32 v28, 1, v28
-; GFX12-NEXT:    v_dual_mov_b32 v56, s2 :: v_dual_and_b32 v21, 0xffff, v27
-; GFX12-NEXT:    v_and_b32_e32 v27, 0xffff, v26
-; GFX12-NEXT:    v_and_b32_e32 v26, 1, v25
-; GFX12-NEXT:    v_and_b32_e32 v25, 0xffff, v31
-; GFX12-NEXT:    v_and_b32_e32 v31, 0xffff, v30
-; GFX12-NEXT:    v_and_b32_e32 v30, 1, v29
-; GFX12-NEXT:    v_and_b32_e32 v29, 0xffff, v35
-; GFX12-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX12-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX12-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX12-NEXT:    v_dual_mov_b32 v16, s6 :: v_dual_and_b32 v47, 0xffff, v16
-; GFX12-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX12-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX12-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX12-NEXT:    v_and_b32_e32 v46, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v45, 0xffff, v44
-; GFX12-NEXT:    v_and_b32_e32 v44, 1, v0
-; GFX12-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX12-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX12-NEXT:    v_and_b32_e32 v35, 0xffff, v34
-; GFX12-NEXT:    v_and_b32_e32 v34, 1, v33
-; GFX12-NEXT:    v_and_b32_e32 v33, 0xffff, v39
-; GFX12-NEXT:    v_and_b32_e32 v39, 0xffff, v38
-; GFX12-NEXT:    v_and_b32_e32 v38, 1, v37
-; GFX12-NEXT:    v_and_b32_e32 v37, 0xffff, v43
-; GFX12-NEXT:    v_and_b32_e32 v43, 0xffff, v42
-; GFX12-NEXT:    v_and_b32_e32 v42, 1, v40
-; GFX12-NEXT:    v_and_b32_e32 v41, 0xffff, v41
-; GFX12-NEXT:    v_and_b32_e32 v40, 1, v32
-; GFX12-NEXT:    v_dual_mov_b32 v32, s7 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_and_b32 v7, 0xffff, v7
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX12-NEXT:    v_and_b32_e32 v36, 1, v36
-; GFX12-NEXT:    s_clause 0x9
-; GFX12-NEXT:    global_store_b128 v64, v[48:51], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v64, v[52:55], s[0:1] offset:192
-; GFX12-NEXT:    global_store_b128 v64, v[56:59], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v64, v[60:63], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v64, v[44:47], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v64, v[40:43], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v64, v[36:39], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v64, v[32:35], s[0:1]
-; GFX12-NEXT:    global_store_b128 v64, v[28:31], s[0:1] offset:176
-; GFX12-NEXT:    global_store_b128 v64, v[24:27], s[0:1] offset:160
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT:    s_lshr_b32 s33, s3, 31
+; GFX12-NEXT:    s_bfe_u32 s34, s3, 0x1001d
+; GFX12-NEXT:    s_bfe_u32 s65, s3, 0x1001c
+; GFX12-NEXT:    s_bfe_u32 s66, s3, 0x1001e
+; GFX12-NEXT:    s_bfe_u32 s30, s3, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s31, s3, 0x10019
+; GFX12-NEXT:    s_bfe_u32 s63, s3, 0x1001a
+; GFX12-NEXT:    s_bfe_u32 s64, s3, 0x10018
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s34
+; GFX12-NEXT:    s_bfe_u32 s29, s3, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s60, s3, 0x10016
+; GFX12-NEXT:    s_bfe_u32 s61, s3, 0x10015
+; GFX12-NEXT:    s_bfe_u32 s62, s3, 0x10014
+; GFX12-NEXT:    v_dual_mov_b32 v0, s65 :: v_dual_mov_b32 v3, s33
+; GFX12-NEXT:    v_dual_mov_b32 v2, s66 :: v_dual_mov_b32 v5, s31
+; GFX12-NEXT:    s_bfe_u32 s27, s3, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s28, s3, 0x10011
+; GFX12-NEXT:    s_bfe_u32 s58, s3, 0x10012
+; GFX12-NEXT:    s_bfe_u32 s59, s3, 0x10010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s64 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT:    v_dual_mov_b32 v6, s63 :: v_dual_mov_b32 v9, s61
+; GFX12-NEXT:    v_dual_mov_b32 v8, s62 :: v_dual_mov_b32 v11, s29
+; GFX12-NEXT:    v_dual_mov_b32 v10, s60 :: v_dual_mov_b32 v13, s28
+; GFX12-NEXT:    s_bfe_u32 s19, s3, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s20, s3, 0x10001
+; GFX12-NEXT:    s_bfe_u32 s21, s3, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s22, s3, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s23, s3, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s24, s3, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s25, s3, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s26, s3, 0x1000d
+; GFX12-NEXT:    s_and_b32 s51, s3, 1
+; GFX12-NEXT:    s_bfe_u32 s52, s3, 0x10002
+; GFX12-NEXT:    s_bfe_u32 s53, s3, 0x10006
+; GFX12-NEXT:    s_bfe_u32 s54, s3, 0x10004
+; GFX12-NEXT:    s_bfe_u32 s55, s3, 0x1000a
+; GFX12-NEXT:    s_bfe_u32 s56, s3, 0x10008
+; GFX12-NEXT:    s_bfe_u32 s57, s3, 0x1000e
+; GFX12-NEXT:    v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v15, s27
+; GFX12-NEXT:    v_mov_b32_e32 v14, s58
+; GFX12-NEXT:    s_bfe_u32 s3, s3, 0x1000c
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT:    v_dual_mov_b32 v3, s25 :: v_dual_mov_b32 v2, s57
+; GFX12-NEXT:    v_dual_mov_b32 v5, s24 :: v_dual_mov_b32 v4, s56
+; GFX12-NEXT:    v_dual_mov_b32 v7, s23 :: v_dual_mov_b32 v6, s55
+; GFX12-NEXT:    v_mov_b32_e32 v9, s22
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s9, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s11, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s12, s2, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s13, s2, 0x10011
+; GFX12-NEXT:    s_bfe_u32 s14, s2, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s15, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s16, s2, 0x10019
+; GFX12-NEXT:    s_lshr_b32 s17, s2, 31
+; GFX12-NEXT:    s_bfe_u32 s18, s2, 0x1001d
+; GFX12-NEXT:    s_and_b32 s35, s2, 1
+; GFX12-NEXT:    s_bfe_u32 s36, s2, 0x10002
+; GFX12-NEXT:    s_bfe_u32 s37, s2, 0x10006
+; GFX12-NEXT:    s_bfe_u32 s38, s2, 0x10004
+; GFX12-NEXT:    s_bfe_u32 s39, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_u32 s40, s2, 0x10008
+; GFX12-NEXT:    s_bfe_u32 s41, s2, 0x1000e
+; GFX12-NEXT:    s_bfe_u32 s42, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_u32 s43, s2, 0x10012
+; GFX12-NEXT:    s_bfe_u32 s44, s2, 0x10010
+; GFX12-NEXT:    s_bfe_u32 s45, s2, 0x10016
+; GFX12-NEXT:    s_bfe_u32 s46, s2, 0x10015
+; GFX12-NEXT:    s_bfe_u32 s47, s2, 0x10014
+; GFX12-NEXT:    s_bfe_u32 s48, s2, 0x1001a
+; GFX12-NEXT:    s_bfe_u32 s49, s2, 0x10018
+; GFX12-NEXT:    s_bfe_u32 s50, s2, 0x1001e
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x1001c
+; GFX12-NEXT:    v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s21
+; GFX12-NEXT:    v_dual_mov_b32 v10, s53 :: v_dual_mov_b32 v13, s20
+; GFX12-NEXT:    v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v15, s19
+; GFX12-NEXT:    v_dual_mov_b32 v14, s52 :: v_dual_mov_b32 v17, s18
+; GFX12-NEXT:    v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s17
+; GFX12-NEXT:    v_dual_mov_b32 v18, s50 :: v_dual_mov_b32 v21, s16
+; GFX12-NEXT:    v_dual_mov_b32 v20, s49 :: v_dual_mov_b32 v23, s15
+; GFX12-NEXT:    v_mov_b32_e32 v22, s48
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s46 :: v_dual_mov_b32 v0, s47
+; GFX12-NEXT:    v_dual_mov_b32 v3, s14 :: v_dual_mov_b32 v2, s45
+; GFX12-NEXT:    v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v4, s44
+; GFX12-NEXT:    v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v6, s43
+; GFX12-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s42
+; GFX12-NEXT:    v_dual_mov_b32 v11, s10 :: v_dual_mov_b32 v10, s41
+; GFX12-NEXT:    v_dual_mov_b32 v13, s9 :: v_dual_mov_b32 v12, s40
+; GFX12-NEXT:    v_dual_mov_b32 v15, s8 :: v_dual_mov_b32 v14, s39
+; GFX12-NEXT:    v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s38
+; GFX12-NEXT:    v_dual_mov_b32 v19, s6 :: v_dual_mov_b32 v18, s37
+; GFX12-NEXT:    v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v20, s35
+; GFX12-NEXT:    v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v22, s36
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v64, v[20:23], s[0:1] offset:144
-; GFX12-NEXT:    global_store_b128 v64, v[16:19], s[0:1] offset:128
-; GFX12-NEXT:    global_store_b128 v64, v[12:15], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v64, v[8:11], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v64, v[4:7], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v64, v[0:3], s[0:1] offset:96
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -3782,216 +3614,219 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[26:27], s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 13, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 15, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 8, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 9, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 11, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 4, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 5, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 7, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 3, s2
-; GFX8-NEXT:    s_lshr_b32 s7, s3, 24
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
-; GFX8-NEXT:    s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT:    s_bfe_i32 s5, s3, 0x10018
-; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x10000
-; GFX8-NEXT:    s_bfe_i32 s9, s2, 0x10000
-; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x10013
-; GFX8-NEXT:    s_bfe_i32 s13, s2, 0x10012
-; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x10011
-; GFX8-NEXT:    s_bfe_i32 s15, s2, 0x10010
-; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x10017
-; GFX8-NEXT:    s_bfe_i32 s17, s2, 0x10016
-; GFX8-NEXT:    s_bfe_i32 s18, s2, 0x10015
-; GFX8-NEXT:    s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT:    s_bfe_i32 s19, s3, 0x10013
-; GFX8-NEXT:    s_bfe_i32 s20, s3, 0x10012
-; GFX8-NEXT:    s_bfe_i32 s21, s3, 0x10011
-; GFX8-NEXT:    s_bfe_i32 s22, s3, 0x10010
-; GFX8-NEXT:    s_bfe_i32 s10, s3, 0x10017
-; GFX8-NEXT:    s_bfe_i32 s11, s3, 0x10016
-; GFX8-NEXT:    s_bfe_i32 s23, s3, 0x10015
-; GFX8-NEXT:    s_bfe_i32 s24, s3, 0x10014
-; GFX8-NEXT:    v_mov_b32_e32 v25, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0xd0
-; GFX8-NEXT:    v_mov_b32_e32 v24, s11
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0xc0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s24
-; GFX8-NEXT:    v_mov_b32_e32 v23, s23
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x50
-; GFX8-NEXT:    v_mov_b32_e32 v22, s22
-; GFX8-NEXT:    v_mov_b32_e32 v23, s21
-; GFX8-NEXT:    v_mov_b32_e32 v24, s20
-; GFX8-NEXT:    v_mov_b32_e32 v25, s19
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v22, s2
-; GFX8-NEXT:    v_mov_b32_e32 v23, s18
-; GFX8-NEXT:    v_mov_b32_e32 v24, s17
-; GFX8-NEXT:    v_mov_b32_e32 v25, s16
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s11
-; GFX8-NEXT:    v_mov_b32_e32 v26, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 48
-; GFX8-NEXT:    v_mov_b32_e32 v22, s15
-; GFX8-NEXT:    v_mov_b32_e32 v23, s14
-; GFX8-NEXT:    v_mov_b32_e32 v24, s13
-; GFX8-NEXT:    v_mov_b32_e32 v25, s12
+; GFX8-NEXT:    s_bfe_i32 s2, s26, 0x10003
+; GFX8-NEXT:    s_bfe_i32 s3, s26, 0x10002
+; GFX8-NEXT:    s_bfe_i32 s4, s26, 0x10001
+; GFX8-NEXT:    s_bfe_i32 s5, s26, 0x10000
+; GFX8-NEXT:    s_bfe_i32 s6, s26, 0x10007
+; GFX8-NEXT:    s_bfe_i32 s7, s26, 0x10006
+; GFX8-NEXT:    s_bfe_i32 s8, s26, 0x10005
+; GFX8-NEXT:    s_bfe_i32 s9, s26, 0x10004
+; GFX8-NEXT:    s_bfe_i32 s10, s26, 0x1000b
+; GFX8-NEXT:    s_bfe_i32 s11, s26, 0x1000a
+; GFX8-NEXT:    s_bfe_i32 s12, s26, 0x10009
+; GFX8-NEXT:    s_bfe_i32 s13, s26, 0x10008
+; GFX8-NEXT:    s_bfe_i32 s14, s26, 0x1000f
+; GFX8-NEXT:    s_bfe_i32 s15, s26, 0x1000e
+; GFX8-NEXT:    s_bfe_i32 s16, s26, 0x1000d
+; GFX8-NEXT:    s_bfe_i32 s17, s26, 0x1000c
+; GFX8-NEXT:    s_bfe_i32 s18, s26, 0x10013
+; GFX8-NEXT:    s_bfe_i32 s19, s26, 0x10012
+; GFX8-NEXT:    s_bfe_i32 s20, s26, 0x10011
+; GFX8-NEXT:    s_bfe_i32 s21, s26, 0x10010
+; GFX8-NEXT:    s_bfe_i32 s22, s26, 0x10017
+; GFX8-NEXT:    s_bfe_i32 s23, s26, 0x10016
+; GFX8-NEXT:    s_bfe_i32 s24, s26, 0x10015
+; GFX8-NEXT:    s_bfe_i32 s25, s26, 0x10014
+; GFX8-NEXT:    s_bfe_i32 s28, s26, 0x1001b
+; GFX8-NEXT:    s_bfe_i32 s29, s26, 0x1001a
+; GFX8-NEXT:    s_bfe_i32 s30, s26, 0x10019
+; GFX8-NEXT:    s_bfe_i32 s31, s26, 0x10018
+; GFX8-NEXT:    s_ashr_i32 s33, s26, 31
+; GFX8-NEXT:    s_bfe_i32 s34, s26, 0x1001e
+; GFX8-NEXT:    s_bfe_i32 s35, s26, 0x1001d
+; GFX8-NEXT:    s_bfe_i32 s36, s26, 0x1001c
+; GFX8-NEXT:    s_bfe_i32 s37, s27, 0x10003
+; GFX8-NEXT:    s_bfe_i32 s38, s27, 0x10002
+; GFX8-NEXT:    s_bfe_i32 s39, s27, 0x10001
+; GFX8-NEXT:    s_bfe_i32 s40, s27, 0x10000
+; GFX8-NEXT:    s_bfe_i32 s41, s27, 0x10007
+; GFX8-NEXT:    s_bfe_i32 s42, s27, 0x10006
+; GFX8-NEXT:    s_bfe_i32 s43, s27, 0x10005
+; GFX8-NEXT:    s_bfe_i32 s44, s27, 0x10004
+; GFX8-NEXT:    s_bfe_i32 s45, s27, 0x1000b
+; GFX8-NEXT:    s_bfe_i32 s46, s27, 0x1000a
+; GFX8-NEXT:    s_bfe_i32 s47, s27, 0x10009
+; GFX8-NEXT:    s_bfe_i32 s48, s27, 0x10008
+; GFX8-NEXT:    s_bfe_i32 s49, s27, 0x1000f
+; GFX8-NEXT:    s_bfe_i32 s50, s27, 0x1000e
+; GFX8-NEXT:    s_bfe_i32 s51, s27, 0x1000d
+; GFX8-NEXT:    s_bfe_i32 s52, s27, 0x1000c
+; GFX8-NEXT:    s_bfe_i32 s53, s27, 0x10013
+; GFX8-NEXT:    s_bfe_i32 s54, s27, 0x10012
+; GFX8-NEXT:    s_bfe_i32 s55, s27, 0x10011
+; GFX8-NEXT:    s_bfe_i32 s56, s27, 0x10010
+; GFX8-NEXT:    s_bfe_i32 s57, s27, 0x10017
+; GFX8-NEXT:    s_bfe_i32 s58, s27, 0x10016
+; GFX8-NEXT:    s_bfe_i32 s59, s27, 0x10015
+; GFX8-NEXT:    s_bfe_i32 s60, s27, 0x10014
+; GFX8-NEXT:    s_bfe_i32 s61, s27, 0x1001b
+; GFX8-NEXT:    s_bfe_i32 s62, s27, 0x1001a
+; GFX8-NEXT:    s_bfe_i32 s63, s27, 0x10019
+; GFX8-NEXT:    s_bfe_i32 s64, s27, 0x10018
+; GFX8-NEXT:    s_ashr_i32 s26, s27, 31
+; GFX8-NEXT:    s_bfe_i32 s65, s27, 0x1001e
+; GFX8-NEXT:    s_bfe_i32 s66, s27, 0x1001d
+; GFX8-NEXT:    s_bfe_i32 s27, s27, 0x1001c
+; GFX8-NEXT:    v_mov_b32_e32 v3, s26
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xf0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v1, s66
+; GFX8-NEXT:    v_mov_b32_e32 v2, s65
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xe0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NEXT:    v_mov_b32_e32 v1, s63
+; GFX8-NEXT:    v_mov_b32_e32 v2, s62
+; GFX8-NEXT:    v_mov_b32_e32 v3, s61
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xd0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s59
+; GFX8-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NEXT:    v_mov_b32_e32 v3, s57
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xc0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s56
+; GFX8-NEXT:    v_mov_b32_e32 v1, s55
+; GFX8-NEXT:    v_mov_b32_e32 v2, s54
+; GFX8-NEXT:    v_mov_b32_e32 v3, s53
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xb0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s52
+; GFX8-NEXT:    v_mov_b32_e32 v1, s51
+; GFX8-NEXT:    v_mov_b32_e32 v2, s50
+; GFX8-NEXT:    v_mov_b32_e32 v3, s49
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xa0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s48
+; GFX8-NEXT:    v_mov_b32_e32 v1, s47
+; GFX8-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NEXT:    v_mov_b32_e32 v3, s45
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x90
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s44
+; GFX8-NEXT:    v_mov_b32_e32 v1, s43
+; GFX8-NEXT:    v_mov_b32_e32 v2, s42
+; GFX8-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x80
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s40
+; GFX8-NEXT:    v_mov_b32_e32 v1, s39
+; GFX8-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NEXT:    v_mov_b32_e32 v3, s37
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x70
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s36
+; GFX8-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NEXT:    v_mov_b32_e32 v2, s34
+; GFX8-NEXT:    v_mov_b32_e32 v3, s33
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x60
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NEXT:    v_mov_b32_e32 v1, s30
+; GFX8-NEXT:    v_mov_b32_e32 v2, s29
+; GFX8-NEXT:    v_mov_b32_e32 v3, s28
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s22
+; GFX8-NEXT:    s_add_u32 s22, s0, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v2, s23
+; GFX8-NEXT:    s_addc_u32 s23, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_mov_b32_e32 v0, s25
+; GFX8-NEXT:    v_mov_b32_e32 v1, s24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    s_add_u32 s18, s0, 64
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    s_addc_u32 s19, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v0, s21
+; GFX8-NEXT:    v_mov_b32_e32 v1, s20
+; GFX8-NEXT:    v_mov_b32_e32 v5, s19
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s14
+; GFX8-NEXT:    s_add_u32 s14, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NEXT:    s_addc_u32 s15, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s14
+; GFX8-NEXT:    v_mov_b32_e32 v0, s17
+; GFX8-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NEXT:    s_add_u32 s10, s0, 32
+; GFX8-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_bfe_i32 v21, v21, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v23, s11
-; GFX8-NEXT:    v_bfe_i32 v20, v20, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v18, v18, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v22, s10
-; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 12, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 13, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 14, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 15, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 9, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 10, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 11, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 4, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 5, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v28, 6, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 1, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 2, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 3, s3
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v19, s3
-; GFX8-NEXT:    v_mov_b32_e32 v18, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    v_bfe_i32 v17, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[14:17]
-; GFX8-NEXT:    v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v15, s3
-; GFX8-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v14, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT:    v_mov_b32_e32 v12, s1
-; GFX8-NEXT:    v_bfe_i32 v10, v9, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v9, v8, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v8, v7, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s9
-; GFX8-NEXT:    v_mov_b32_e32 v11, s0
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[11:12], v[7:10]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 6, s8
-; GFX8-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 5, s8
-; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xa0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 4, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 1, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
-; GFX8-NEXT:    v_bfe_i32 v8, v11, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 2, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 3, s8
-; GFX8-NEXT:    v_bfe_i32 v7, v10, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v11, v1, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v10, v0, 0, 1
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v12, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v13, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v13, v24, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v12, v2, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x90
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 5, s7
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 2, s7
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v12, v15, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v15, v19, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v19, v23, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v25, v22, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v28, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v27, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v22, v26, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x80
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 4, s7
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 1, s7
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[22:25]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v11, v14, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v14, v18, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v21, v21, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v20, v20, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v18, s6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xf0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 6, s7
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 7, s7
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[18:21]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v17, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 3, s7
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[14:17]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v13, v2, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 7, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    s_add_u32 s0, s0, 0x60
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[6:9]
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_mov_b32_e32 v0, s13
+; GFX8-NEXT:    v_mov_b32_e32 v1, s12
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    s_add_u32 s6, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NEXT:    v_mov_b32_e32 v1, s8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v64i1_to_v64i32:
@@ -4245,149 +4080,123 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v28, 12, s3
-; GFX12-NEXT:    v_lshrrev_b16 v29, 13, s3
-; GFX12-NEXT:    v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT:    v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT:    v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v21, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT:    v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX12-NEXT:    v_lshrrev_b16 v16, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v24, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v25, 9, s3
-; GFX12-NEXT:    v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT:    v_lshrrev_b16 v27, 11, s3
-; GFX12-NEXT:    v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT:    v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT:    v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v8, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v32, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 4, s4
-; GFX12-NEXT:    v_lshrrev_b16 v13, 5, s4
-; GFX12-NEXT:    v_lshrrev_b16 v14, 6, s4
-; GFX12-NEXT:    v_lshrrev_b16 v15, 7, s4
-; GFX12-NEXT:    v_lshrrev_b16 v1, 3, s5
-; GFX12-NEXT:    v_lshrrev_b16 v7, 1, s5
-; GFX12-NEXT:    v_lshrrev_b16 v44, 7, s5
-; GFX12-NEXT:    v_lshrrev_b16 v40, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v41, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v42, 10, s2
-; GFX12-NEXT:    v_lshrrev_b16 v43, 11, s2
-; GFX12-NEXT:    v_lshrrev_b16 v36, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v37, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v39, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v33, 1, s2
-; GFX12-NEXT:    v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT:    v_lshrrev_b16 v35, 3, s2
-; GFX12-NEXT:    v_lshrrev_b16 v9, 1, s4
-; GFX12-NEXT:    v_lshrrev_b16 v10, 2, s4
-; GFX12-NEXT:    v_lshrrev_b16 v11, 3, s4
-; GFX12-NEXT:    v_lshrrev_b16 v4, 4, s5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 5, s5
-; GFX12-NEXT:    v_lshrrev_b16 v6, 6, s5
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, s5
-; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT:    s_bfe_i32 s5, s3, 0x10018
-; GFX12-NEXT:    s_bfe_i32 s6, s3, 0x10000
+; GFX12-NEXT:    s_ashr_i32 s63, s3, 31
+; GFX12-NEXT:    s_bfe_i32 s64, s3, 0x1001e
+; GFX12-NEXT:    s_bfe_i32 s65, s3, 0x1001c
+; GFX12-NEXT:    s_bfe_i32 s66, s3, 0x1001d
+; GFX12-NEXT:    s_bfe_i32 s59, s3, 0x1001b
+; GFX12-NEXT:    s_bfe_i32 s60, s3, 0x1001a
+; GFX12-NEXT:    s_bfe_i32 s61, s3, 0x10019
+; GFX12-NEXT:    s_bfe_i32 s62, s3, 0x10018
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s66
+; GFX12-NEXT:    s_bfe_i32 s55, s3, 0x10017
+; GFX12-NEXT:    s_bfe_i32 s56, s3, 0x10016
+; GFX12-NEXT:    s_bfe_i32 s57, s3, 0x10015
+; GFX12-NEXT:    s_bfe_i32 s58, s3, 0x10014
+; GFX12-NEXT:    v_dual_mov_b32 v0, s65 :: v_dual_mov_b32 v3, s63
+; GFX12-NEXT:    v_dual_mov_b32 v2, s64 :: v_dual_mov_b32 v5, s61
+; GFX12-NEXT:    s_bfe_i32 s51, s3, 0x10013
+; GFX12-NEXT:    s_bfe_i32 s52, s3, 0x10012
+; GFX12-NEXT:    s_bfe_i32 s53, s3, 0x10011
+; GFX12-NEXT:    s_bfe_i32 s54, s3, 0x10010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s62 :: v_dual_mov_b32 v7, s59
+; GFX12-NEXT:    v_dual_mov_b32 v6, s60 :: v_dual_mov_b32 v9, s57
+; GFX12-NEXT:    v_dual_mov_b32 v8, s58 :: v_dual_mov_b32 v11, s55
+; GFX12-NEXT:    v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s53
+; GFX12-NEXT:    s_bfe_i32 s36, s3, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s37, s3, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s38, s3, 0x10001
+; GFX12-NEXT:    s_bfe_i32 s39, s3, 0x10000
+; GFX12-NEXT:    s_bfe_i32 s40, s3, 0x10007
+; GFX12-NEXT:    s_bfe_i32 s41, s3, 0x10006
+; GFX12-NEXT:    s_bfe_i32 s42, s3, 0x10005
+; GFX12-NEXT:    s_bfe_i32 s43, s3, 0x10004
+; GFX12-NEXT:    s_bfe_i32 s44, s3, 0x1000b
+; GFX12-NEXT:    s_bfe_i32 s45, s3, 0x1000a
+; GFX12-NEXT:    s_bfe_i32 s46, s3, 0x10009
+; GFX12-NEXT:    s_bfe_i32 s47, s3, 0x10008
+; GFX12-NEXT:    s_bfe_i32 s48, s3, 0x1000f
+; GFX12-NEXT:    s_bfe_i32 s49, s3, 0x1000e
+; GFX12-NEXT:    s_bfe_i32 s50, s3, 0x1000d
+; GFX12-NEXT:    v_dual_mov_b32 v12, s54 :: v_dual_mov_b32 v15, s51
+; GFX12-NEXT:    v_mov_b32_e32 v14, s52
+; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x1000c
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s50 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT:    v_dual_mov_b32 v3, s48 :: v_dual_mov_b32 v2, s49
+; GFX12-NEXT:    v_dual_mov_b32 v5, s46 :: v_dual_mov_b32 v4, s47
+; GFX12-NEXT:    v_dual_mov_b32 v7, s44 :: v_dual_mov_b32 v6, s45
+; GFX12-NEXT:    v_mov_b32_e32 v9, s42
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x10003
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x10002
+; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x10001
 ; GFX12-NEXT:    s_bfe_i32 s7, s2, 0x10000
-; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10013
-; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10012
-; GFX12-NEXT:    s_bfe_i32 s10, s2, 0x10011
-; GFX12-NEXT:    s_bfe_i32 s11, s2, 0x10010
-; GFX12-NEXT:    s_bfe_i32 s12, s2, 0x10017
-; GFX12-NEXT:    s_bfe_i32 s13, s2, 0x10016
-; GFX12-NEXT:    s_bfe_i32 s14, s2, 0x10015
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x10014
-; GFX12-NEXT:    s_bfe_i32 s15, s3, 0x10013
-; GFX12-NEXT:    s_bfe_i32 s16, s3, 0x10012
-; GFX12-NEXT:    s_bfe_i32 s17, s3, 0x10011
-; GFX12-NEXT:    s_bfe_i32 s18, s3, 0x10010
-; GFX12-NEXT:    s_bfe_i32 s19, s3, 0x10017
-; GFX12-NEXT:    s_bfe_i32 s20, s3, 0x10016
-; GFX12-NEXT:    s_bfe_i32 s21, s3, 0x10014
-; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x10015
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v49, s3
-; GFX12-NEXT:    v_bfe_i32 v23, v23, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v22, v22, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v21, v21, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v20, v20, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v31, v31, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v30, v30, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v29, v29, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v28, v28, 0, 1
-; GFX12-NEXT:    v_dual_mov_b32 v48, s21 :: v_dual_mov_b32 v51, s19
-; GFX12-NEXT:    v_dual_mov_b32 v50, s20 :: v_dual_mov_b32 v53, s17
-; GFX12-NEXT:    v_dual_mov_b32 v54, s16 :: v_dual_mov_b32 v57, s14
-; GFX12-NEXT:    v_dual_mov_b32 v56, s2 :: v_dual_mov_b32 v59, s12
-; GFX12-NEXT:    v_dual_mov_b32 v58, s13 :: v_dual_mov_b32 v61, s10
-; GFX12-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v18, v18, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v17, v17, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v27, v27, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v26, v26, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v25, v25, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v24, v24, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v46, v16, 0, 1
-; GFX12-NEXT:    v_dual_mov_b32 v52, s18 :: v_dual_mov_b32 v55, s15
-; GFX12-NEXT:    v_dual_mov_b32 v60, s11 :: v_dual_mov_b32 v63, s8
-; GFX12-NEXT:    v_mov_b32_e32 v62, s9
-; GFX12-NEXT:    v_mov_b32_e32 v16, s6
-; GFX12-NEXT:    v_bfe_i32 v3, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v1, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v7, v44, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v15, v15, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v13, v13, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v47, v32, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v45, v8, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v44, v0, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v10, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v43, v43, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v42, v42, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v41, v41, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v40, v40, 0, 1
-; GFX12-NEXT:    v_mov_b32_e32 v8, s5
-; GFX12-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v39, v39, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v38, v38, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v37, v37, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v36, v36, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v35, v35, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v34, v34, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v33, v33, 0, 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v64, v[48:51], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v64, v[52:55], s[0:1] offset:192
-; GFX12-NEXT:    v_mov_b32_e32 v32, s7
-; GFX12-NEXT:    s_clause 0x7
-; GFX12-NEXT:    global_store_b128 v64, v[56:59], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v64, v[60:63], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v64, v[44:47], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v64, v[40:43], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v64, v[36:39], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v64, v[32:35], s[0:1]
-; GFX12-NEXT:    global_store_b128 v64, v[28:31], s[0:1] offset:176
-; GFX12-NEXT:    global_store_b128 v64, v[24:27], s[0:1] offset:160
-; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x10007
+; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x10006
+; GFX12-NEXT:    s_bfe_i32 s10, s2, 0x10005
+; GFX12-NEXT:    s_bfe_i32 s11, s2, 0x10004
+; GFX12-NEXT:    s_bfe_i32 s12, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_i32 s13, s2, 0x1000a
+; GFX12-NEXT:    s_bfe_i32 s14, s2, 0x10009
+; GFX12-NEXT:    s_bfe_i32 s15, s2, 0x10008
+; GFX12-NEXT:    s_bfe_i32 s16, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_i32 s17, s2, 0x1000e
+; GFX12-NEXT:    s_bfe_i32 s18, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_i32 s19, s2, 0x1000c
+; GFX12-NEXT:    s_bfe_i32 s20, s2, 0x10013
+; GFX12-NEXT:    s_bfe_i32 s21, s2, 0x10012
+; GFX12-NEXT:    s_bfe_i32 s22, s2, 0x10011
+; GFX12-NEXT:    s_bfe_i32 s23, s2, 0x10010
+; GFX12-NEXT:    s_bfe_i32 s24, s2, 0x10017
+; GFX12-NEXT:    s_bfe_i32 s25, s2, 0x10016
+; GFX12-NEXT:    s_bfe_i32 s26, s2, 0x10015
+; GFX12-NEXT:    s_bfe_i32 s27, s2, 0x10014
+; GFX12-NEXT:    s_bfe_i32 s28, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_i32 s29, s2, 0x1001a
+; GFX12-NEXT:    s_bfe_i32 s30, s2, 0x10019
+; GFX12-NEXT:    s_bfe_i32 s31, s2, 0x10018
+; GFX12-NEXT:    s_ashr_i32 s33, s2, 31
+; GFX12-NEXT:    s_bfe_i32 s34, s2, 0x1001e
+; GFX12-NEXT:    s_bfe_i32 s35, s2, 0x1001d
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x1001c
+; GFX12-NEXT:    v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v11, s40
+; GFX12-NEXT:    v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v13, s38
+; GFX12-NEXT:    v_dual_mov_b32 v12, s39 :: v_dual_mov_b32 v15, s36
+; GFX12-NEXT:    v_dual_mov_b32 v14, s37 :: v_dual_mov_b32 v17, s35
+; GFX12-NEXT:    v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s33
+; GFX12-NEXT:    v_dual_mov_b32 v18, s34 :: v_dual_mov_b32 v21, s30
+; GFX12-NEXT:    v_dual_mov_b32 v20, s31 :: v_dual_mov_b32 v23, s28
+; GFX12-NEXT:    v_mov_b32_e32 v22, s29
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s26 :: v_dual_mov_b32 v0, s27
+; GFX12-NEXT:    v_dual_mov_b32 v3, s24 :: v_dual_mov_b32 v2, s25
+; GFX12-NEXT:    v_dual_mov_b32 v5, s22 :: v_dual_mov_b32 v4, s23
+; GFX12-NEXT:    v_dual_mov_b32 v7, s20 :: v_dual_mov_b32 v6, s21
+; GFX12-NEXT:    v_dual_mov_b32 v9, s18 :: v_dual_mov_b32 v8, s19
+; GFX12-NEXT:    v_dual_mov_b32 v11, s16 :: v_dual_mov_b32 v10, s17
+; GFX12-NEXT:    v_dual_mov_b32 v13, s14 :: v_dual_mov_b32 v12, s15
+; GFX12-NEXT:    v_dual_mov_b32 v15, s12 :: v_dual_mov_b32 v14, s13
+; GFX12-NEXT:    v_dual_mov_b32 v17, s10 :: v_dual_mov_b32 v16, s11
+; GFX12-NEXT:    v_dual_mov_b32 v19, s8 :: v_dual_mov_b32 v18, s9
+; GFX12-NEXT:    v_dual_mov_b32 v21, s6 :: v_dual_mov_b32 v20, s7
+; GFX12-NEXT:    v_dual_mov_b32 v23, s4 :: v_dual_mov_b32 v22, s5
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v64, v[20:23], s[0:1] offset:144
-; GFX12-NEXT:    global_store_b128 v64, v[16:19], s[0:1] offset:128
-; GFX12-NEXT:    global_store_b128 v64, v[12:15], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v64, v[8:11], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v64, v[4:7], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v64, v[0:3], s[0:1] offset:96
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -4708,6 +4517,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX8-LABEL: constant_zextload_v2i1_to_v2i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -4715,11 +4525,10 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -4750,11 +4559,13 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4798,7 +4609,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -4834,7 +4645,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 1
@@ -4879,28 +4690,26 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX8-LABEL: constant_zextload_v3i1_to_v3i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v10, 2
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v5
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 16
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NEXT:    v_mov_b32_e32 v6, s2
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v7, s1
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v8
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX8-NEXT:    v_bfe_u32 v2, v4, 1, 1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    flat_store_dwordx2 v[8:9], v[4:5]
+; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v3i1_to_v3i64:
@@ -4936,15 +4745,16 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v5, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 2, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v2
-; GFX12-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v6, 1, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v6
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-NEXT:    v_bfe_u32 v2, v0, 1, 1
+; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX12-NEXT:    v_mov_b32_e32 v3, v5
+; GFX12-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b64 v5, v[4:5], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
@@ -4998,8 +4808,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX8-NEXT:    v_bfe_i32 v8, v3, 0, 1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
@@ -5046,8 +4856,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v6, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-NEXT:    v_bfe_i32 v4, v1, 0, 1
@@ -5099,6 +4909,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX8-LABEL: constant_zextload_v4i1_to_v4i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v2, 3
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -5114,15 +4925,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 3, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff, v12
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v0
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 1, 1
+; GFX8-NEXT:    v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 2, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GFX8-NEXT:    s_endpgm
@@ -5163,23 +4969,23 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v6, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 1, v0
-; GFX12-NEXT:    v_mov_b32_e32 v3, v1
-; GFX12-NEXT:    v_lshrrev_b16 v0, 3, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v2
-; GFX12-NEXT:    v_and_b32_e32 v9, 1, v4
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v4, 0xffff, v6
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v9
-; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10002
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 3, v0
+; GFX12-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -5233,9 +5039,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 3, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 2, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 3, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
 ; GFX8-NEXT:    v_bfe_i32 v6, v4, 0, 1
 ; GFX8-NEXT:    v_bfe_i32 v4, v3, 0, 1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
@@ -5287,9 +5093,9 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 2, v0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 1, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 3, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 1, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-NEXT:    v_bfe_i32 v6, v1, 0, 1
@@ -5368,39 +5174,31 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s0
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, s1
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8-NEXT:    v_mov_b32_e32 v20, s2
+; GFX8-NEXT:    v_mov_b32_e32 v23, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v13, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v15, v1
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v22, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 5, v0
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 3, v0
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v0
+; GFX8-NEXT:    v_bfe_u32 v6, v0, 5, 1
+; GFX8-NEXT:    v_bfe_u32 v4, v0, 4, 1
+; GFX8-NEXT:    v_bfe_u32 v10, v0, 3, 1
+; GFX8-NEXT:    v_bfe_u32 v14, v0, 1, 1
 ; GFX8-NEXT:    v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v24
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GFX8-NEXT:    v_bfe_u32 v8, v0, 2, 1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 7, v24
+; GFX8-NEXT:    v_bfe_u32 v0, v24, 6, 1
 ; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
 ; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -5456,27 +5254,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT:    global_load_u8 v12, v1, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v8, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v14, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 6, v0
-; GFX12-NEXT:    v_lshrrev_b16 v10, 4, v0
-; GFX12-NEXT:    v_and_b32_e32 v17, 1, v4
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v18, 1, v8
-; GFX12-NEXT:    v_lshrrev_b16 v16, 2, v0
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v14, 1, v14
-; GFX12-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v0, 1, v6
-; GFX12-NEXT:    v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v12
+; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v7, v1
+; GFX12-NEXT:    v_bfe_u32 v6, v12, 5, 1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v8, 1, v16
-; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v4, 1, v10
-; GFX12-NEXT:    v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v6, 0xffff, v17
-; GFX12-NEXT:    v_and_b32_e32 v10, 0xffff, v18
-; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 7, v0
+; GFX12-NEXT:    v_bfe_u32 v0, v0, 6, 1
+; GFX12-NEXT:    v_bfe_u32 v4, v12, 4, 1
+; GFX12-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-NEXT:    v_bfe_u32 v10, v12, 3, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v12, 2, 1
+; GFX12-NEXT:    v_mov_b32_e32 v13, v1
+; GFX12-NEXT:    v_mov_b32_e32 v15, v1
+; GFX12-NEXT:    v_bfe_u32 v14, v12, 1, 1
+; GFX12-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
 ; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:32
@@ -5542,47 +5337,56 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX8-NEXT:    s_lshr_b32 s2, s3, 6
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 7
+; GFX8-NEXT:    s_lshr_b32 s6, s3, 4
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 5
+; GFX8-NEXT:    s_lshr_b32 s10, s3, 2
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 3
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v18, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    v_mov_b32_e32 v6, s4
+; GFX8-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[4:7]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v8, s6
+; GFX8-NEXT:    v_mov_b32_e32 v9, s7
+; GFX8-NEXT:    v_mov_b32_e32 v10, s8
+; GFX8-NEXT:    v_mov_b32_e32 v11, s9
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, s1
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
-; GFX8-NEXT:    v_mov_b32_e32 v22, s0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v7, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 5, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 3, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
-; GFX8-NEXT:    v_bfe_i32 v14, v5, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v12, v3, 0, 1
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v10, v8, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v8, v7, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX8-NEXT:    v_mov_b32_e32 v12, s10
+; GFX8-NEXT:    v_mov_b32_e32 v13, s11
+; GFX8-NEXT:    v_mov_b32_e32 v14, s12
+; GFX8-NEXT:    v_mov_b32_e32 v15, s13
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[12:15]
 ; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -5645,36 +5449,39 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v16, s[2:3]
+; GFX12-NEXT:    global_load_u8 v0, v16, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 6, v1
-; GFX12-NEXT:    v_lshrrev_b16 v5, 7, v1
-; GFX12-NEXT:    v_lshrrev_b16 v7, 4, v1
-; GFX12-NEXT:    v_lshrrev_b16 v4, 3, v1
-; GFX12-NEXT:    v_lshrrev_b16 v8, 2, v1
-; GFX12-NEXT:    v_lshrrev_b16 v9, 5, v1
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v1
-; GFX12-NEXT:    v_bfe_i32 v14, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v12, v3, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v8, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v0, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_mov_b32_e32 v9, s3
+; GFX12-NEXT:    s_lshr_b32 s2, s3, 6
+; GFX12-NEXT:    s_lshr_b32 s4, s3, 7
+; GFX12-NEXT:    s_lshr_b32 s6, s3, 4
+; GFX12-NEXT:    s_lshr_b32 s8, s3, 5
+; GFX12-NEXT:    s_lshr_b32 s10, s3, 2
+; GFX12-NEXT:    s_lshr_b32 s12, s3, 3
+; GFX12-NEXT:    s_lshr_b32 s14, s3, 1
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT:    v_bfe_i32 v12, v9, 0, 1
+; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT:    v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX12-NEXT:    v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX12-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX12-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -5747,102 +5554,85 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v2
-; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    s_add_u32 s4, s0, 0x50
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s5
-; GFX8-NEXT:    v_mov_b32_e32 v22, s4
-; GFX8-NEXT:    v_mov_b32_e32 v9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v11, v2
-; GFX8-NEXT:    v_mov_b32_e32 v12, v2
-; GFX8-NEXT:    v_mov_b32_e32 v14, v2
-; GFX8-NEXT:    v_mov_b32_e32 v15, v2
-; GFX8-NEXT:    v_mov_b32_e32 v17, v2
-; GFX8-NEXT:    v_mov_b32_e32 v19, v2
-; GFX8-NEXT:    v_mov_b32_e32 v21, v2
-; GFX8-NEXT:    v_mov_b32_e32 v25, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 10, v0
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 11, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff, v1
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[5:8]
-; GFX8-NEXT:    v_mov_b32_e32 v23, s3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 14, v0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 15, v0
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x10001
+; GFX8-NEXT:    s_and_b32 s8, s2, 1
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s12, s2, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x1000a
+; GFX8-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v6, s3
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v7, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[1:4]
-; GFX8-NEXT:    v_mov_b32_e32 v23, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, 1
+; GFX8-NEXT:    v_bfe_u32 v2, v4, 11, 1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s2
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 9, v0
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
+; GFX8-NEXT:    v_bfe_u32 v14, v4, 5, 1
+; GFX8-NEXT:    v_bfe_u32 v8, v4, 14, 1
+; GFX8-NEXT:    v_bfe_u32 v4, v4, 8, 1
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v3
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[8:11]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 12, v0
-; GFX8-NEXT:    flat_store_dwordx4 v[1:2], v[8:11]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 13, v0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v6
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    flat_store_dwordx4 v[3:4], v[11:14]
-; GFX8-NEXT:    s_add_u32 s0, s0, 16
-; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 7, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v16, 5, v0
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v0
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 3, v0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v11, s3
-; GFX8-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v16
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff, v3
-; GFX8-NEXT:    v_mov_b32_e32 v10, s2
-; GFX8-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v0
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[14:17]
-; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[18:21]
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
-; GFX8-NEXT:    flat_store_dwordx4 v[1:2], v[22:25]
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v2, v14
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -5935,56 +5725,51 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v28, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 11, v0
-; GFX12-NEXT:    v_lshrrev_b16 v8, 9, v0
-; GFX12-NEXT:    v_lshrrev_b16 v12, 13, v0
-; GFX12-NEXT:    v_lshrrev_b16 v16, 7, v0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 15, v0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 14, v0
-; GFX12-NEXT:    v_lshrrev_b16 v10, 10, v0
-; GFX12-NEXT:    v_lshrrev_b16 v20, 5, v0
-; GFX12-NEXT:    v_lshrrev_b16 v24, 3, v0
-; GFX12-NEXT:    v_lshrrev_b16 v32, 1, v0
-; GFX12-NEXT:    v_and_b32_e32 v33, 1, v4
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v34, 1, v8
-; GFX12-NEXT:    v_lshrrev_b16 v14, 8, v0
-; GFX12-NEXT:    v_lshrrev_b16 v18, 12, v0
-; GFX12-NEXT:    v_and_b32_e32 v35, 1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v36, 1, v16
-; GFX12-NEXT:    v_lshrrev_b16 v22, 6, v0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v38, 1, v24
-; GFX12-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_and_b32 v32, 1, v32
-; GFX12-NEXT:    v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v4, 1, v10
-; GFX12-NEXT:    v_mov_b32_e32 v23, v1
-; GFX12-NEXT:    v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX12-NEXT:    v_mov_b32_e32 v31, v1
-; GFX12-NEXT:    v_lshrrev_b16 v26, 4, v0
-; GFX12-NEXT:    v_lshrrev_b16 v30, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v37, 1, v20
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v6
-; GFX12-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_and_b32 v6, 0xffff, v33
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v24, 1, v30
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v14
-; GFX12-NEXT:    v_dual_mov_b32 v13, v1 :: v_dual_and_b32 v10, 0xffff, v34
-; GFX12-NEXT:    v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v20, 1, v26
-; GFX12-NEXT:    v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v16, 1, v22
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v18
-; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v35
-; GFX12-NEXT:    v_dual_mov_b32 v27, v1 :: v_dual_and_b32 v18, 0xffff, v36
-; GFX12-NEXT:    v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v32
-; GFX12-NEXT:    v_and_b32_e32 v26, 0xffff, v38
-; GFX12-NEXT:    v_and_b32_e32 v22, 0xffff, v37
-; GFX12-NEXT:    s_clause 0x7
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v1, v[12:15], s[0:1] offset:96
-; GFX12-NEXT:    global_store_b128 v1, v[16:19], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v1, v[20:23], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v1, v[24:27], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v7, v1
+; GFX12-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_bfe_u32 v2, v4, 11, 1
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000a
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000c
+; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_bfe_u32 v6, v4, 5, 1
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10006
+; GFX12-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10002
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10004
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX12-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX12-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v4, 14, 1
+; GFX12-NEXT:    v_bfe_u32 v4, v4, 8, 1
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    v_mov_b32_e32 v6, s3
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s5
+; GFX12-NEXT:    s_clause 0x2
+; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -6074,92 +5859,109 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v27, s1
+; GFX8-NEXT:    v_mov_b32_e32 v26, s0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX8-NEXT:    s_lshr_b32 s2, s3, 14
+; GFX8-NEXT:    s_lshr_b32 s4, s3, 15
+; GFX8-NEXT:    s_lshr_b32 s6, s3, 12
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 13
+; GFX8-NEXT:    s_lshr_b32 s10, s3, 10
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 11
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX8-NEXT:    s_lshr_b32 s16, s3, 9
+; GFX8-NEXT:    s_lshr_b32 s18, s3, 6
+; GFX8-NEXT:    s_lshr_b32 s20, s3, 7
+; GFX8-NEXT:    s_lshr_b32 s22, s3, 4
+; GFX8-NEXT:    s_lshr_b32 s24, s3, 5
+; GFX8-NEXT:    s_lshr_b32 s26, s3, 2
+; GFX8-NEXT:    s_lshr_b32 s28, s3, 3
+; GFX8-NEXT:    s_lshr_b32 s30, s3, 1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, s2
+; GFX8-NEXT:    v_mov_b32_e32 v23, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    v_mov_b32_e32 v22, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[2:5]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NEXT:    v_mov_b32_e32 v7, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_mov_b32_e32 v7, s7
+; GFX8-NEXT:    v_mov_b32_e32 v8, s8
+; GFX8-NEXT:    v_mov_b32_e32 v9, s9
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, s3
-; GFX8-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[6:9]
+; GFX8-NEXT:    v_mov_b32_e32 v10, s10
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 64
+; GFX8-NEXT:    v_mov_b32_e32 v11, s11
+; GFX8-NEXT:    v_mov_b32_e32 v12, s12
+; GFX8-NEXT:    v_mov_b32_e32 v13, s13
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v16, s3
-; GFX8-NEXT:    v_mov_b32_e32 v15, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[10:13]
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v14, s14
+; GFX8-NEXT:    v_mov_b32_e32 v15, s15
+; GFX8-NEXT:    v_mov_b32_e32 v16, s16
+; GFX8-NEXT:    v_mov_b32_e32 v17, s17
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s3
-; GFX8-NEXT:    v_mov_b32_e32 v22, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[14:17]
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_mov_b32_e32 v21, s1
+; GFX8-NEXT:    v_mov_b32_e32 v18, s18
+; GFX8-NEXT:    v_mov_b32_e32 v19, s19
+; GFX8-NEXT:    v_mov_b32_e32 v20, s20
+; GFX8-NEXT:    v_mov_b32_e32 v21, s21
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v20, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[18:21]
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v22, s22
+; GFX8-NEXT:    v_mov_b32_e32 v23, s23
+; GFX8-NEXT:    v_mov_b32_e32 v24, s24
+; GFX8-NEXT:    v_mov_b32_e32 v25, s25
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v25, s3
-; GFX8-NEXT:    v_mov_b32_e32 v27, s1
-; GFX8-NEXT:    v_mov_b32_e32 v24, s2
-; GFX8-NEXT:    v_mov_b32_e32 v26, s0
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 14, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 15, v0
-; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v11, 12, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v12, 13, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX8-NEXT:    flat_store_dwordx4 v[5:6], v[1:4]
-; GFX8-NEXT:    v_lshrrev_b16_e32 v13, 10, v0
-; GFX8-NEXT:    v_bfe_i32 v3, v12, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v1, v11, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 11, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 3, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v5, 2, v0
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[1:4]
-; GFX8-NEXT:    v_lshrrev_b16_e32 v17, 8, v0
-; GFX8-NEXT:    v_bfe_i32 v3, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v14, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v4, v13, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v18, 9, v0
-; GFX8-NEXT:    v_bfe_i32 v1, v5, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[4:7]
-; GFX8-NEXT:    v_bfe_i32 v10, v18, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v8, v17, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v19, 6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v28, 4, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v29, 5, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v0
-; GFX8-NEXT:    v_bfe_i32 v12, v0, 0, 1
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 7, v0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[8:11]
-; GFX8-NEXT:    v_bfe_i32 v18, v0, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v16, v19, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v7, v29, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v5, v28, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v14, v2, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[16:19]
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[5:8]
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[1:4]
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v9, s1
+; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    v_mov_b32_e32 v6, s28
+; GFX8-NEXT:    v_mov_b32_e32 v7, s29
+; GFX8-NEXT:    v_mov_b32_e32 v8, s0
+; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s30
+; GFX8-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[0:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v16i1_to_v16i64:
@@ -6266,64 +6068,69 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 ; GFX12-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v32, s[2:3]
+; GFX12-NEXT:    global_load_u16 v0, v32, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v3, 14, v1
-; GFX12-NEXT:    v_lshrrev_b16 v5, 15, v1
-; GFX12-NEXT:    v_lshrrev_b16 v7, 12, v1
-; GFX12-NEXT:    v_lshrrev_b16 v9, 13, v1
-; GFX12-NEXT:    v_lshrrev_b16 v11, 10, v1
-; GFX12-NEXT:    v_lshrrev_b16 v13, 11, v1
-; GFX12-NEXT:    v_lshrrev_b16 v15, 8, v1
-; GFX12-NEXT:    v_lshrrev_b16 v16, 9, v1
-; GFX12-NEXT:    v_lshrrev_b16 v12, 6, v1
-; GFX12-NEXT:    v_lshrrev_b16 v14, 7, v1
-; GFX12-NEXT:    v_lshrrev_b16 v8, 4, v1
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, v1
-; GFX12-NEXT:    v_lshrrev_b16 v4, 3, v1
-; GFX12-NEXT:    v_lshrrev_b16 v10, 2, v1
-; GFX12-NEXT:    v_lshrrev_b16 v17, 5, v1
-; GFX12-NEXT:    v_bfe_i32 v30, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v28, v3, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v26, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v24, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v22, v13, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v20, v11, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v18, v16, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v16, v15, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v0, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v10, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v17, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v8, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v31, 31, v30
+; GFX12-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_lshr_b32 s4, s3, 15
+; GFX12-NEXT:    s_lshr_b32 s2, s3, 14
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT:    s_lshr_b32 s6, s3, 12
+; GFX12-NEXT:    s_lshr_b32 s8, s3, 13
+; GFX12-NEXT:    s_lshr_b32 s10, s3, 10
+; GFX12-NEXT:    s_lshr_b32 s12, s3, 11
+; GFX12-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX12-NEXT:    s_lshr_b32 s16, s3, 9
+; GFX12-NEXT:    s_lshr_b32 s18, s3, 6
+; GFX12-NEXT:    s_lshr_b32 s20, s3, 7
+; GFX12-NEXT:    s_lshr_b32 s22, s3, 4
+; GFX12-NEXT:    s_lshr_b32 s24, s3, 5
+; GFX12-NEXT:    s_lshr_b32 s26, s3, 2
+; GFX12-NEXT:    s_lshr_b32 s28, s3, 3
+; GFX12-NEXT:    s_lshr_b32 s30, s3, 1
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4
+; GFX12-NEXT:    v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6
+; GFX12-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v6, s8
+; GFX12-NEXT:    v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v8, s10
+; GFX12-NEXT:    v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v10, s12
+; GFX12-NEXT:    v_mov_b32_e32 v15, s17
+; GFX12-NEXT:    v_bfe_i32 v28, v28, 0, 1
+; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT:    v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v19, s21
+; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v21, s23
+; GFX12-NEXT:    v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v23, s25
+; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v25, s27
+; GFX12-NEXT:    v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v27, s29
+; GFX12-NEXT:    v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v31, s31
+; GFX12-NEXT:    v_mov_b32_e32 v26, s28
+; GFX12-NEXT:    v_mov_b32_e32 v30, s30
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:96
 ; GFX12-NEXT:    v_ashrrev_i32_e32 v29, 31, v28
-; GFX12-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX12-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX12-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX12-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
-; GFX12-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GFX12-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX12-NEXT:    s_clause 0x7
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:96
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -6444,176 +6251,152 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s6, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s2
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s2
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 7, s2
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT:    s_lshr_b32 s14, s2, 24
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s2
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x10018
-; GFX8-NEXT:    s_and_b32 s11, s2, 1
-; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s17, s2, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s18, s2, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT:    s_bfe_u32 s20, s2, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s21, s2, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 14, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 10, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 4, s2
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 15, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xa0
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
+; GFX8-NEXT:    s_lshr_b32 s7, s6, 31
+; GFX8-NEXT:    s_bfe_u32 s8, s6, 0x1001d
+; GFX8-NEXT:    s_bfe_u32 s9, s6, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s10, s6, 0x10019
+; GFX8-NEXT:    s_bfe_u32 s11, s6, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s12, s6, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s13, s6, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s14, s6, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s15, s6, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s16, s6, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s17, s6, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s18, s6, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s19, s6, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s4, s6, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s2, s6, 0x10001
+; GFX8-NEXT:    s_and_b32 s3, s6, 1
+; GFX8-NEXT:    s_bfe_u32 s5, s6, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s20, s6, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s21, s6, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s22, s6, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s23, s6, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s24, s6, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s25, s6, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s26, s6, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s27, s6, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s28, s6, 0x10014
+; GFX8-NEXT:    s_bfe_u32 s29, s6, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s30, s6, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s31, s6, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s33, s6, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s34, s6, 0x1001c
+; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x1001e
+; GFX8-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xf0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xe0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xd0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s33
+; GFX8-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xc0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xb0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s30
+; GFX8-NEXT:    v_mov_b32_e32 v2, s11
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0xa0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s28
+; GFX8-NEXT:    v_mov_b32_e32 v2, s29
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_add_u32 s6, s0, 0x90
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x80
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NEXT:    s_add_u32 s12, s0, 0x70
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s14
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s14
-; GFX8-NEXT:    v_mov_b32_e32 v23, s13
-; GFX8-NEXT:    v_and_b32_e32 v25, 1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s12
-; GFX8-NEXT:    s_add_u32 s12, s0, 0xf0
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v19, v1
-; GFX8-NEXT:    v_mov_b32_e32 v21, v1
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT:    v_mov_b32_e32 v23, s13
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 6, s14
-; GFX8-NEXT:    v_mov_b32_e32 v22, s12
-; GFX8-NEXT:    s_add_u32 s12, s0, 0x60
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v6
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 7, s14
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 4, s14
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v15
-; GFX8-NEXT:    v_mov_b32_e32 v15, s13
-; GFX8-NEXT:    v_mov_b32_e32 v14, s12
-; GFX8-NEXT:    s_add_u32 s12, s0, 0x50
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s13
-; GFX8-NEXT:    v_mov_b32_e32 v22, s12
-; GFX8-NEXT:    s_add_u32 s12, s0, 64
-; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[18:21]
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v11
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v10
-; GFX8-NEXT:    v_mov_b32_e32 v10, 1
-; GFX8-NEXT:    v_mov_b32_e32 v23, s13
-; GFX8-NEXT:    v_and_b32_sdwa v18, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v8
-; GFX8-NEXT:    v_mov_b32_e32 v22, s12
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff, v2
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v7
-; GFX8-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NEXT:    v_and_b32_e32 v21, 1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0x80
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0x70
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s25
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0x60
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 0x50
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s23
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 64
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s22
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 48
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s21
-; GFX8-NEXT:    v_mov_b32_e32 v2, s22
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8-NEXT:    v_mov_b32_e32 v0, s19
-; GFX8-NEXT:    v_mov_b32_e32 v2, s20
-; GFX8-NEXT:    v_mov_b32_e32 v7, s4
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v8, s7
-; GFX8-NEXT:    v_mov_b32_e32 v0, s17
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    v_mov_b32_e32 v7, s6
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NEXT:    v_mov_b32_e32 v0, s16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s15
-; GFX8-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v24
-; GFX8-NEXT:    v_mov_b32_e32 v22, v1
-; GFX8-NEXT:    v_mov_b32_e32 v24, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v5
-; GFX8-NEXT:    v_mov_b32_e32 v21, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[18:21]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v16
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v12
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v12, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mov_b32_e32 v8, s1
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[9:12]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s11
-; GFX8-NEXT:    v_mov_b32_e32 v2, v14
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s0
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 1, s14
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX8-NEXT:    v_mov_b32_e32 v16, v1
-; GFX8-NEXT:    v_mov_b32_e32 v18, v1
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 2, s14
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v25
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    s_add_u32 s6, s0, 32
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s20
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_add_u32 s4, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    s_addc_u32 s5, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -6783,109 +6566,87 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 12, s2
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v8, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v16, 7, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 6, s3
-; GFX12-NEXT:    v_lshrrev_b16 v17, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT:    v_lshrrev_b16 v22, 2, s3
-; GFX12-NEXT:    v_lshrrev_b16 v23, 1, s3
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10017
-; GFX12-NEXT:    v_lshrrev_b16 v11, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT:    v_and_b32_e32 v24, 1, v4
-; GFX12-NEXT:    v_and_b32_e32 v25, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v28, 1, v21
-; GFX12-NEXT:    v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
-; GFX12-NEXT:    v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1001e
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT:    s_lshr_b32 s4, s2, 31
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1001d
+; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1001c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1001a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10019
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10016
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10015
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v15, 1, s2
-; GFX12-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX12-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10015
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10013
-; GFX12-NEXT:    v_lshrrev_b16 v6, 10, s2
-; GFX12-NEXT:    v_and_b32_e32 v26, 1, v15
-; GFX12-NEXT:    v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
-; GFX12-NEXT:    v_and_b32_e32 v9, 1, v17
-; GFX12-NEXT:    v_and_b32_e32 v29, 1, v23
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10012
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT:    v_mov_b32_e32 v2, s6
-; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v10, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v14, 2, s2
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT:    s_and_b32 s5, s2, 1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10010
-; GFX12-NEXT:    v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
-; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v17, 0xffff, v24
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
-; GFX12-NEXT:    v_and_b32_e32 v43, 0xffff, v26
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v14
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v12
-; GFX12-NEXT:    v_and_b32_e32 v44, 0xffff, v29
-; GFX12-NEXT:    v_dual_mov_b32 v34, v1 :: v_dual_and_b32 v35, 1, v18
-; GFX12-NEXT:    v_and_b32_e32 v37, 0xffff, v16
-; GFX12-NEXT:    v_and_b32_e32 v39, 1, v7
-; GFX12-NEXT:    v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000e
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT:    v_mov_b32_e32 v5, v1
-; GFX12-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
-; GFX12-NEXT:    v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
-; GFX12-NEXT:    v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
-; GFX12-NEXT:    v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
-; GFX12-NEXT:    v_mov_b32_e32 v20, v1
-; GFX12-NEXT:    v_mov_b32_e32 v22, v1
-; GFX12-NEXT:    v_mov_b32_e32 v18, v1
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v10
-; GFX12-NEXT:    v_and_b32_e32 v10, 0xffff, v11
-; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v25
-; GFX12-NEXT:    v_mov_b32_e32 v24, v1
-; GFX12-NEXT:    s_clause 0x4
-; GFX12-NEXT:    global_store_b128 v1, v[35:38], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v1, v[39:42], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v1, v[31:34], s[0:1] offset:96
-; GFX12-NEXT:    global_store_b128 v1, v[19:22], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v1, v[15:18], s[0:1] offset:64
-; GFX12-NEXT:    v_mov_b32_e32 v15, v1
-; GFX12-NEXT:    v_mov_b32_e32 v11, v1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
-; GFX12-NEXT:    v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
-; GFX12-NEXT:    v_mov_b32_e32 v28, v1
-; GFX12-NEXT:    s_clause 0x4
-; GFX12-NEXT:    global_store_b128 v1, v[12:15], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v1, v[27:30], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v1, v[23:26], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10008
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10006
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10004
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10002
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -7065,189 +6826,220 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s6, s4, 22
-; GFX8-NEXT:    s_lshr_b32 s8, s4, 23
-; GFX8-NEXT:    s_lshr_b32 s10, s4, 20
-; GFX8-NEXT:    s_lshr_b32 s12, s4, 21
-; GFX8-NEXT:    s_lshr_b32 s14, s4, 18
-; GFX8-NEXT:    s_lshr_b32 s16, s4, 19
-; GFX8-NEXT:    s_lshr_b32 s18, s4, 16
-; GFX8-NEXT:    s_lshr_b32 s20, s4, 17
-; GFX8-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 14, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 15, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 12, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 13, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 10, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 11, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 8, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 9, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 6, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 7, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 4, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 5, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 2, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 1, s4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 7, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 4, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 5, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 3, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 1, s2
-; GFX8-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_lshr_b32 s44, s2, 30
+; GFX8-NEXT:    s_lshr_b32 s46, s2, 31
+; GFX8-NEXT:    s_lshr_b32 s48, s2, 28
+; GFX8-NEXT:    s_lshr_b32 s50, s2, 29
+; GFX8-NEXT:    s_lshr_b32 s52, s2, 26
+; GFX8-NEXT:    s_lshr_b32 s54, s2, 27
+; GFX8-NEXT:    s_lshr_b32 s56, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s58, s2, 25
+; GFX8-NEXT:    s_lshr_b32 s60, s2, 22
+; GFX8-NEXT:    s_lshr_b32 s62, s2, 23
+; GFX8-NEXT:    s_lshr_b32 s64, s2, 20
+; GFX8-NEXT:    s_lshr_b32 s66, s2, 21
+; GFX8-NEXT:    s_lshr_b32 s42, s2, 18
+; GFX8-NEXT:    s_lshr_b32 s40, s2, 19
+; GFX8-NEXT:    s_lshr_b32 s38, s2, 16
+; GFX8-NEXT:    s_lshr_b32 s36, s2, 17
+; GFX8-NEXT:    s_lshr_b32 s34, s2, 14
+; GFX8-NEXT:    s_lshr_b32 s30, s2, 15
+; GFX8-NEXT:    s_lshr_b32 s28, s2, 12
+; GFX8-NEXT:    s_lshr_b32 s26, s2, 13
+; GFX8-NEXT:    s_lshr_b32 s24, s2, 10
+; GFX8-NEXT:    s_lshr_b32 s22, s2, 11
+; GFX8-NEXT:    s_lshr_b32 s20, s2, 8
+; GFX8-NEXT:    s_lshr_b32 s18, s2, 9
+; GFX8-NEXT:    s_lshr_b32 s16, s2, 6
+; GFX8-NEXT:    s_lshr_b32 s14, s2, 7
+; GFX8-NEXT:    s_lshr_b32 s12, s2, 4
+; GFX8-NEXT:    s_lshr_b32 s10, s2, 5
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 2
+; GFX8-NEXT:    s_lshr_b32 s6, s2, 3
+; GFX8-NEXT:    s_lshr_b32 s68, s2, 1
+; GFX8-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[2:3], s[68:69], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX8-NEXT:    v_mov_b32_e32 v21, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0xb0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s7
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v26, s7
-; GFX8-NEXT:    v_mov_b32_e32 v25, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0xa0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s8
-; GFX8-NEXT:    v_mov_b32_e32 v24, s9
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT:    v_mov_b32_e32 v26, s7
-; GFX8-NEXT:    v_mov_b32_e32 v25, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x90
-; GFX8-NEXT:    v_mov_b32_e32 v21, s10
-; GFX8-NEXT:    v_mov_b32_e32 v22, s11
-; GFX8-NEXT:    v_mov_b32_e32 v23, s12
-; GFX8-NEXT:    v_mov_b32_e32 v24, s13
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT:    v_mov_b32_e32 v26, s7
-; GFX8-NEXT:    v_mov_b32_e32 v25, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x80
-; GFX8-NEXT:    v_mov_b32_e32 v21, s14
-; GFX8-NEXT:    v_mov_b32_e32 v22, s15
-; GFX8-NEXT:    v_mov_b32_e32 v23, s16
-; GFX8-NEXT:    v_mov_b32_e32 v24, s17
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT:    v_mov_b32_e32 v26, s7
-; GFX8-NEXT:    v_mov_b32_e32 v21, s18
-; GFX8-NEXT:    v_mov_b32_e32 v22, s19
-; GFX8-NEXT:    v_mov_b32_e32 v23, s20
-; GFX8-NEXT:    v_mov_b32_e32 v24, s21
-; GFX8-NEXT:    v_mov_b32_e32 v25, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x70
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v23, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v21, v2, 0, 1
+; GFX8-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX8-NEXT:    v_mov_b32_e32 v0, s44
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xf0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s45
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NEXT:    v_mov_b32_e32 v3, s47
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xe0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v0, s48
+; GFX8-NEXT:    v_mov_b32_e32 v1, s49
+; GFX8-NEXT:    v_mov_b32_e32 v2, s50
+; GFX8-NEXT:    v_mov_b32_e32 v3, s51
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xd0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v0, s52
+; GFX8-NEXT:    v_mov_b32_e32 v1, s53
+; GFX8-NEXT:    v_mov_b32_e32 v2, s54
+; GFX8-NEXT:    v_mov_b32_e32 v3, s55
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xc0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v0, s56
+; GFX8-NEXT:    v_mov_b32_e32 v1, s57
+; GFX8-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NEXT:    v_mov_b32_e32 v3, s59
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xb0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v0, s60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s61
+; GFX8-NEXT:    v_mov_b32_e32 v2, s62
+; GFX8-NEXT:    v_mov_b32_e32 v3, s63
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    s_add_u32 s44, s0, 0xa0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s45, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s44
+; GFX8-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NEXT:    v_mov_b32_e32 v1, s65
+; GFX8-NEXT:    v_mov_b32_e32 v2, s66
+; GFX8-NEXT:    v_mov_b32_e32 v3, s67
+; GFX8-NEXT:    v_mov_b32_e32 v5, s45
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s40
+; GFX8-NEXT:    s_add_u32 s40, s0, 0x90
+; GFX8-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NEXT:    s_addc_u32 s41, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s40
+; GFX8-NEXT:    v_mov_b32_e32 v0, s42
+; GFX8-NEXT:    v_mov_b32_e32 v1, s43
+; GFX8-NEXT:    v_mov_b32_e32 v5, s41
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s36
+; GFX8-NEXT:    s_add_u32 s36, s0, 0x80
+; GFX8-NEXT:    v_mov_b32_e32 v3, s37
+; GFX8-NEXT:    s_addc_u32 s37, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s36
+; GFX8-NEXT:    v_mov_b32_e32 v0, s38
+; GFX8-NEXT:    v_mov_b32_e32 v1, s39
+; GFX8-NEXT:    v_mov_b32_e32 v5, s37
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s30
+; GFX8-NEXT:    s_add_u32 s30, s0, 0x70
+; GFX8-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NEXT:    s_addc_u32 s31, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s30
+; GFX8-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NEXT:    v_mov_b32_e32 v5, s31
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NEXT:    s_add_u32 s26, s0, 0x60
+; GFX8-NEXT:    v_mov_b32_e32 v3, s27
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v0, s28
+; GFX8-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NEXT:    s_add_u32 s22, s0, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v3, s23
+; GFX8-NEXT:    s_addc_u32 s23, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NEXT:    v_mov_b32_e32 v1, s25
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    s_add_u32 s18, s0, 64
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    s_addc_u32 s19, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v0, s20
+; GFX8-NEXT:    v_mov_b32_e32 v1, s21
+; GFX8-NEXT:    v_mov_b32_e32 v5, s19
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NEXT:    s_add_u32 s14, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    s_addc_u32 s15, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s14
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mov_b32_e32 v5, s15
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    s_add_u32 s10, s0, 32
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s6
-; GFX8-NEXT:    v_mov_b32_e32 v3, s7
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x60
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT:    v_mov_b32_e32 v26, s7
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT:    v_mov_b32_e32 v25, s6
-; GFX8-NEXT:    v_bfe_i32 v23, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v21, v15, 0, 1
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x50
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT:    v_bfe_i32 v25, v14, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v13, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v14, s7
-; GFX8-NEXT:    v_mov_b32_e32 v13, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 64
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[13:14], v[23:26]
-; GFX8-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v25, v10, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v9, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v10, s7
-; GFX8-NEXT:    v_mov_b32_e32 v9, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 48
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[23:26]
-; GFX8-NEXT:    v_bfe_i32 v10, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v25, v8, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v6, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v9, s7
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_mov_b32_e32 v8, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[23:26]
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v25, v5, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v4, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NEXT:    s_add_u32 s6, s0, 16
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[23:26]
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v25, v1, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v0, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_bfe_i32 v6, v7, 0, 1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    v_mov_b32_e32 v5, s5
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xf0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xd0
-; GFX8-NEXT:    v_bfe_i32 v17, v18, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v15, v16, 0, 1
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
-; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_bfe_i32 v21, v20, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v2, v27, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -7448,120 +7240,123 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v26, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v28, 7, s2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 2, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 3, s2
-; GFX12-NEXT:    s_lshr_b32 s22, s2, 24
-; GFX12-NEXT:    v_lshrrev_b16 v8, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 6, s22
-; GFX12-NEXT:    v_lshrrev_b16 v11, 7, s22
-; GFX12-NEXT:    v_lshrrev_b16 v13, 4, s22
-; GFX12-NEXT:    v_lshrrev_b16 v15, 5, s22
-; GFX12-NEXT:    v_lshrrev_b16 v0, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v1, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v14, 2, s22
-; GFX12-NEXT:    v_lshrrev_b16 v16, 3, s22
-; GFX12-NEXT:    v_lshrrev_b16 v35, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v37, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v34, 10, s2
-; GFX12-NEXT:    v_lshrrev_b16 v36, 11, s2
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v27, 9, s2
-; GFX12-NEXT:    v_lshrrev_b16 v12, 1, s22
-; GFX12-NEXT:    v_bfe_i32 v6, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v28, v28, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v26, v26, 0, 1
-; GFX12-NEXT:    s_lshr_b32 s4, s2, 22
-; GFX12-NEXT:    s_lshr_b32 s8, s2, 23
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v3, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v8, 0, 1
-; GFX12-NEXT:    s_lshr_b32 s10, s2, 20
-; GFX12-NEXT:    s_lshr_b32 s12, s2, 21
-; GFX12-NEXT:    s_lshr_b32 s20, s2, 17
-; GFX12-NEXT:    v_bfe_i32 v24, v11, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v22, v7, 0, 1
-; GFX12-NEXT:    s_lshr_b32 s14, s2, 18
-; GFX12-NEXT:    s_lshr_b32 s16, s2, 19
-; GFX12-NEXT:    v_bfe_i32 v20, v15, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v18, v13, 0, 1
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s18, s2, 16
-; GFX12-NEXT:    s_bfe_i64 s[6:7], s[2:3], 0x10000
-; GFX12-NEXT:    v_bfe_i32 v16, v16, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v44, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v42, v0, 0, 1
+; GFX12-NEXT:    s_lshr_b32 s34, s2, 30
+; GFX12-NEXT:    s_lshr_b32 s36, s2, 31
+; GFX12-NEXT:    s_lshr_b32 s38, s2, 28
+; GFX12-NEXT:    s_lshr_b32 s40, s2, 29
+; GFX12-NEXT:    s_lshr_b32 s42, s2, 26
+; GFX12-NEXT:    s_lshr_b32 s44, s2, 27
+; GFX12-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s46, s2, 24
+; GFX12-NEXT:    s_lshr_b32 s48, s2, 25
+; GFX12-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35
+; GFX12-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37
+; GFX12-NEXT:    v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s39
+; GFX12-NEXT:    s_lshr_b32 s26, s2, 22
+; GFX12-NEXT:    s_lshr_b32 s50, s2, 23
+; GFX12-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s38 :: v_dual_mov_b32 v7, s41
+; GFX12-NEXT:    v_dual_mov_b32 v6, s40 :: v_dual_mov_b32 v9, s43
+; GFX12-NEXT:    s_lshr_b32 s52, s2, 20
+; GFX12-NEXT:    s_lshr_b32 s54, s2, 21
+; GFX12-NEXT:    v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45
+; GFX12-NEXT:    v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s47
+; GFX12-NEXT:    s_lshr_b32 s56, s2, 18
+; GFX12-NEXT:    s_lshr_b32 s58, s2, 19
+; GFX12-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v12, s46 :: v_dual_mov_b32 v15, s49
+; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-NEXT:    v_mov_b32_e32 v14, s48
+; GFX12-NEXT:    s_lshr_b32 s60, s2, 16
+; GFX12-NEXT:    s_lshr_b32 s62, s2, 17
+; GFX12-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s64, s2, 14
+; GFX12-NEXT:    s_lshr_b32 s66, s2, 15
+; GFX12-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s27 :: v_dual_mov_b32 v0, s26
+; GFX12-NEXT:    v_dual_mov_b32 v3, s51 :: v_dual_mov_b32 v2, s50
+; GFX12-NEXT:    v_mov_b32_e32 v5, s53
+; GFX12-NEXT:    s_lshr_b32 s30, s2, 12
+; GFX12-NEXT:    s_lshr_b32 s28, s2, 13
+; GFX12-NEXT:    s_lshr_b32 s24, s2, 10
+; GFX12-NEXT:    s_lshr_b32 s22, s2, 11
+; GFX12-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s55
+; GFX12-NEXT:    v_dual_mov_b32 v6, s54 :: v_dual_mov_b32 v9, s57
+; GFX12-NEXT:    s_lshr_b32 s20, s2, 8
+; GFX12-NEXT:    s_lshr_b32 s18, s2, 9
+; GFX12-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59
+; GFX12-NEXT:    v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s61
+; GFX12-NEXT:    s_lshr_b32 s16, s2, 6
+; GFX12-NEXT:    s_lshr_b32 s14, s2, 7
+; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v12, s60 :: v_dual_mov_b32 v15, s63
+; GFX12-NEXT:    v_dual_mov_b32 v14, s62 :: v_dual_mov_b32 v17, s65
+; GFX12-NEXT:    s_lshr_b32 s12, s2, 4
+; GFX12-NEXT:    s_lshr_b32 s10, s2, 5
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v62, 0 :: v_dual_mov_b32 v47, s5
-; GFX12-NEXT:    v_bfe_i32 v12, v12, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    v_bfe_i32 v32, v27, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v30, v9, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v29, 31, v28
-; GFX12-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX12-NEXT:    v_bfe_i32 v36, v36, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v34, v34, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v40, v37, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v38, v35, 0, 1
-; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v16, s64 :: v_dual_mov_b32 v19, s67
+; GFX12-NEXT:    v_dual_mov_b32 v18, s66 :: v_dual_mov_b32 v21, s31
+; GFX12-NEXT:    s_lshr_b32 s8, s2, 2
+; GFX12-NEXT:    s_lshr_b32 s6, s2, 3
 ; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v46, s4 :: v_dual_mov_b32 v49, s9
-; GFX12-NEXT:    v_dual_mov_b32 v48, s8 :: v_dual_mov_b32 v51, s11
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[22:23], 0x10000
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v50, s10 :: v_dual_mov_b32 v53, s13
-; GFX12-NEXT:    v_dual_mov_b32 v52, s12 :: v_dual_mov_b32 v55, s15
-; GFX12-NEXT:    v_dual_mov_b32 v60, s20 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX12-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX12-NEXT:    v_dual_mov_b32 v54, s14 :: v_dual_mov_b32 v57, s17
-; GFX12-NEXT:    v_dual_mov_b32 v56, s16 :: v_dual_mov_b32 v59, s19
-; GFX12-NEXT:    v_ashrrev_i32_e32 v21, 31, v20
-; GFX12-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT:    v_dual_mov_b32 v58, s18 :: v_dual_mov_b32 v61, s21
-; GFX12-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GFX12-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX12-NEXT:    v_ashrrev_i32_e32 v45, 31, v44
-; GFX12-NEXT:    v_ashrrev_i32_e32 v43, 31, v42
-; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX12-NEXT:    v_ashrrev_i32_e32 v33, 31, v32
-; GFX12-NEXT:    v_ashrrev_i32_e32 v31, 31, v30
-; GFX12-NEXT:    v_ashrrev_i32_e32 v37, 31, v36
-; GFX12-NEXT:    v_ashrrev_i32_e32 v35, 31, v34
-; GFX12-NEXT:    v_ashrrev_i32_e32 v41, 31, v40
-; GFX12-NEXT:    v_ashrrev_i32_e32 v39, 31, v38
-; GFX12-NEXT:    s_clause 0x9
-; GFX12-NEXT:    global_store_b128 v62, v[46:49], s[0:1] offset:176
-; GFX12-NEXT:    global_store_b128 v62, v[50:53], s[0:1] offset:160
-; GFX12-NEXT:    global_store_b128 v62, v[54:57], s[0:1] offset:144
-; GFX12-NEXT:    global_store_b128 v62, v[58:61], s[0:1] offset:128
-; GFX12-NEXT:    global_store_b128 v62, v[42:45], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v62, v[38:41], s[0:1] offset:96
-; GFX12-NEXT:    global_store_b128 v62, v[34:37], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v62, v[30:33], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v62, v[26:29], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v62, v[8:11], s[0:1] offset:32
-; GFX12-NEXT:    v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s29
+; GFX12-NEXT:    v_mov_b32_e32 v22, s28
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v62, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v62, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v62, v[22:25], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v62, v[18:21], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v62, v[14:17], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v62, v[10:13], s[0:1] offset:192
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24
+; GFX12-NEXT:    v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22
+; GFX12-NEXT:    v_mov_b32_e32 v5, s21
+; GFX12-NEXT:    s_lshr_b32 s68, s2, 1
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT:    v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17
+; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s15
+; GFX12-NEXT:    v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[68:69], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s11
+; GFX12-NEXT:    v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v17, s9
+; GFX12-NEXT:    v_dual_mov_b32 v16, s8 :: v_dual_mov_b32 v19, s7
+; GFX12-NEXT:    v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s5
+; GFX12-NEXT:    v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT:    v_mov_b32_e32 v22, s2
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -7778,345 +7573,311 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 13, s2
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 11, s2
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 9, s2
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 7, s2
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 5, s2
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT:    s_lshr_b32 s33, s3, 24
-; GFX8-NEXT:    s_lshr_b32 s24, s2, 24
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 10, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT:    s_bfe_u32 s20, s2, 0x10018
-; GFX8-NEXT:    s_bfe_u32 s21, s3, 0x10018
-; GFX8-NEXT:    s_and_b32 s22, s3, 1
-; GFX8-NEXT:    s_and_b32 s23, s2, 1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 15, s2
-; GFX8-NEXT:    s_bfe_u32 s25, s2, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s26, s2, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s27, s2, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s28, s2, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s29, s2, 0x10014
-; GFX8-NEXT:    s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s31, s2, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x10017
-; GFX8-NEXT:    s_bfe_u32 s34, s3, 0x10011
-; GFX8-NEXT:    s_bfe_u32 s35, s3, 0x10010
-; GFX8-NEXT:    s_bfe_u32 s36, s3, 0x10012
-; GFX8-NEXT:    s_bfe_u32 s37, s3, 0x10013
-; GFX8-NEXT:    s_bfe_u32 s38, s3, 0x10016
-; GFX8-NEXT:    s_bfe_u32 s39, s3, 0x10017
-; GFX8-NEXT:    s_bfe_u32 s40, s3, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s41, s3, 0x10014
-; GFX8-NEXT:    s_add_u32 s4, s0, 0x1a0
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x1b0
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x190
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x180
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    s_add_u32 s12, s0, 0xb0
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    s_add_u32 s14, s0, 0xa0
-; GFX8-NEXT:    s_addc_u32 s15, s1, 0
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x90
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    s_add_u32 s18, s0, 0x80
-; GFX8-NEXT:    s_addc_u32 s19, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s3
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x70
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s42
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mov_b32_e32 v24, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x170
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 14, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[2:5]
-; GFX8-NEXT:    s_addc_u32 s43, s1, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshr_b32 s44, s43, 31
+; GFX8-NEXT:    s_bfe_u32 s45, s43, 0x1001d
+; GFX8-NEXT:    s_bfe_u32 s46, s43, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s47, s43, 0x10019
+; GFX8-NEXT:    s_bfe_u32 s48, s43, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s49, s43, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s50, s43, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s51, s43, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s52, s43, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s53, s43, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s40, s43, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s38, s43, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s37, s43, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s35, s43, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s33, s43, 0x10001
+; GFX8-NEXT:    s_lshr_b32 s30, s42, 31
+; GFX8-NEXT:    s_bfe_u32 s28, s42, 0x1001d
+; GFX8-NEXT:    s_bfe_u32 s26, s42, 0x1001b
+; GFX8-NEXT:    s_bfe_u32 s25, s42, 0x10019
+; GFX8-NEXT:    s_bfe_u32 s22, s42, 0x10017
+; GFX8-NEXT:    s_bfe_u32 s19, s42, 0x10013
+; GFX8-NEXT:    s_bfe_u32 s17, s42, 0x10011
+; GFX8-NEXT:    s_bfe_u32 s15, s42, 0x1000f
+; GFX8-NEXT:    s_bfe_u32 s13, s42, 0x1000d
+; GFX8-NEXT:    s_bfe_u32 s12, s42, 0x1000b
+; GFX8-NEXT:    s_bfe_u32 s10, s42, 0x10009
+; GFX8-NEXT:    s_bfe_u32 s8, s42, 0x10007
+; GFX8-NEXT:    s_bfe_u32 s6, s42, 0x10005
+; GFX8-NEXT:    s_bfe_u32 s4, s42, 0x10003
+; GFX8-NEXT:    s_bfe_u32 s2, s42, 0x10001
+; GFX8-NEXT:    s_and_b32 s3, s42, 1
+; GFX8-NEXT:    s_bfe_u32 s5, s42, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s7, s42, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s9, s42, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s11, s42, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s14, s42, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s16, s42, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s18, s42, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s20, s42, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s21, s42, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s23, s42, 0x10014
+; GFX8-NEXT:    s_bfe_u32 s24, s42, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s27, s42, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s29, s42, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s31, s42, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s34, s42, 0x1001c
+; GFX8-NEXT:    s_bfe_u32 s36, s42, 0x1001e
+; GFX8-NEXT:    s_and_b32 s39, s43, 1
+; GFX8-NEXT:    s_bfe_u32 s41, s43, 0x10002
+; GFX8-NEXT:    s_bfe_u32 s54, s43, 0x10004
+; GFX8-NEXT:    s_bfe_u32 s55, s43, 0x10006
+; GFX8-NEXT:    s_bfe_u32 s56, s43, 0x10008
+; GFX8-NEXT:    s_bfe_u32 s57, s43, 0x1000a
+; GFX8-NEXT:    s_bfe_u32 s58, s43, 0x1000c
+; GFX8-NEXT:    s_bfe_u32 s59, s43, 0x1000e
+; GFX8-NEXT:    s_bfe_u32 s60, s43, 0x10010
+; GFX8-NEXT:    s_bfe_u32 s61, s43, 0x10012
+; GFX8-NEXT:    s_bfe_u32 s62, s43, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s63, s43, 0x10018
+; GFX8-NEXT:    s_bfe_u32 s64, s43, 0x1001a
+; GFX8-NEXT:    s_bfe_u32 s65, s43, 0x1001c
+; GFX8-NEXT:    s_bfe_u32 s66, s43, 0x1001e
+; GFX8-NEXT:    s_bfe_u32 s42, s43, 0x10015
+; GFX8-NEXT:    s_bfe_u32 s43, s43, 0x10014
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 15, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, v1
-; GFX8-NEXT:    v_mov_b32_e32 v25, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s3
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1a0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s43
+; GFX8-NEXT:    s_addc_u32 s43, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x1f0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 6, s33
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 7, s33
-; GFX8-NEXT:    v_mov_b32_e32 v22, v1
-; GFX8-NEXT:    v_mov_b32_e32 v24, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 0xf0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 6, s24
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s66
+; GFX8-NEXT:    v_mov_b32_e32 v2, s44
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1e0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 7, s24
-; GFX8-NEXT:    v_mov_b32_e32 v23, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x60
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s65
+; GFX8-NEXT:    v_mov_b32_e32 v2, s45
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1d0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v19
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v18
-; GFX8-NEXT:    v_mov_b32_e32 v18, s42
-; GFX8-NEXT:    v_mov_b32_e32 v19, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x50
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1c0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v17
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v16
-; GFX8-NEXT:    v_mov_b32_e32 v16, s42
-; GFX8-NEXT:    v_mov_b32_e32 v17, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 64
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v17, 1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s63
+; GFX8-NEXT:    v_mov_b32_e32 v2, s47
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1b0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v26, s42
-; GFX8-NEXT:    v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v15
-; GFX8-NEXT:    v_mov_b32_e32 v27, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 48
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s62
+; GFX8-NEXT:    v_mov_b32_e32 v2, s48
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x190
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v13
-; GFX8-NEXT:    v_mov_b32_e32 v13, s42
-; GFX8-NEXT:    v_mov_b32_e32 v14, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[13:14], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s61
+; GFX8-NEXT:    v_mov_b32_e32 v2, s49
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x180
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v10
-; GFX8-NEXT:    v_mov_b32_e32 v10, s42
-; GFX8-NEXT:    v_mov_b32_e32 v11, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 16
-; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s60
+; GFX8-NEXT:    v_mov_b32_e32 v2, s50
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x170
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v9
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v8
-; GFX8-NEXT:    v_mov_b32_e32 v8, s42
-; GFX8-NEXT:    v_mov_b32_e32 v9, s43
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s59
+; GFX8-NEXT:    v_mov_b32_e32 v2, s51
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x160
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 12, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 3, s33
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[22:25]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 1, s33
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s42
-; GFX8-NEXT:    v_and_b32_e32 v28, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v7
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
-; GFX8-NEXT:    v_mov_b32_e32 v11, v1
-; GFX8-NEXT:    v_mov_b32_e32 v23, s43
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 5, s24
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s58
+; GFX8-NEXT:    v_mov_b32_e32 v2, s52
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x150
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 10, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v4
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s42
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v21
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s57
+; GFX8-NEXT:    v_mov_b32_e32 v2, s53
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[7:10]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 9, s3
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v4
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 1, s24
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x140
-; GFX8-NEXT:    v_mov_b32_e32 v6, s3
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v8, s42
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 6, s3
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v4
-; GFX8-NEXT:    v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v20
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v9, s43
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s56
+; GFX8-NEXT:    v_mov_b32_e32 v2, s40
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x130
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 5, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v18
-; GFX8-NEXT:    v_mov_b32_e32 v17, s42
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    v_mov_b32_e32 v18, s43
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s55
+; GFX8-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x120
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 4, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[17:18], v[7:10]
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v3
-; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 3, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 1, s3
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v19
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT:    v_mov_b32_e32 v17, v1
-; GFX8-NEXT:    v_mov_b32_e32 v19, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x110
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 2, s3
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v15
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v12
-; GFX8-NEXT:    v_mov_b32_e32 v18, v1
-; GFX8-NEXT:    v_mov_b32_e32 v20, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff, v13
-; GFX8-NEXT:    v_mov_b32_e32 v13, s5
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_mov_b32_e32 v0, s54
+; GFX8-NEXT:    v_mov_b32_e32 v2, s37
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    s_add_u32 s40, s0, 0x110
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s41
-; GFX8-NEXT:    v_mov_b32_e32 v2, s40
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v12, s4
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v0, s38
-; GFX8-NEXT:    v_mov_b32_e32 v2, s39
-; GFX8-NEXT:    v_mov_b32_e32 v12, s6
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s9
+; GFX8-NEXT:    s_addc_u32 s41, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s40
+; GFX8-NEXT:    v_mov_b32_e32 v2, s35
+; GFX8-NEXT:    v_mov_b32_e32 v5, s41
+; GFX8-NEXT:    s_add_u32 s38, s0, 0x100
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s39
+; GFX8-NEXT:    s_addc_u32 s39, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s38
+; GFX8-NEXT:    v_mov_b32_e32 v2, s33
+; GFX8-NEXT:    v_mov_b32_e32 v5, s39
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s36
-; GFX8-NEXT:    v_mov_b32_e32 v2, s37
-; GFX8-NEXT:    v_mov_b32_e32 v12, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s11
-; GFX8-NEXT:    v_mov_b32_e32 v0, s35
-; GFX8-NEXT:    v_mov_b32_e32 v2, s34
-; GFX8-NEXT:    v_mov_b32_e32 v12, s10
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v12, s12
-; GFX8-NEXT:    v_mov_b32_e32 v0, s31
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v13, s13
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v12, s14
-; GFX8-NEXT:    v_mov_b32_e32 v0, s29
+; GFX8-NEXT:    s_add_u32 s36, s0, 0xf0
+; GFX8-NEXT:    s_addc_u32 s37, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s36
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s30
-; GFX8-NEXT:    v_mov_b32_e32 v13, s15
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v12, s16
-; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    v_mov_b32_e32 v5, s37
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NEXT:    s_add_u32 s34, s0, 0xe0
+; GFX8-NEXT:    s_addc_u32 s35, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s34
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s28
-; GFX8-NEXT:    v_mov_b32_e32 v13, s17
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v12, s18
-; GFX8-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NEXT:    v_mov_b32_e32 v5, s35
+; GFX8-NEXT:    s_add_u32 s30, s0, 0xd0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NEXT:    s_addc_u32 s31, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s30
+; GFX8-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NEXT:    v_mov_b32_e32 v5, s31
+; GFX8-NEXT:    s_add_u32 s28, s0, 0xc0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s29
+; GFX8-NEXT:    s_addc_u32 s29, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s28
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s25
-; GFX8-NEXT:    v_mov_b32_e32 v13, s19
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x100
+; GFX8-NEXT:    v_mov_b32_e32 v5, s29
+; GFX8-NEXT:    s_add_u32 s26, s0, 0xb0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NEXT:    s_addc_u32 s27, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NEXT:    s_add_u32 s22, s0, 0xa0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s23
-; GFX8-NEXT:    v_mov_b32_e32 v2, v10
-; GFX8-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 5, s33
-; GFX8-NEXT:    v_mov_b32_e32 v0, s22
-; GFX8-NEXT:    v_mov_b32_e32 v2, v8
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x1e0
-; GFX8-NEXT:    v_and_b32_e32 v26, 1, v14
-; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 4, s33
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v27
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v26
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x1d0
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0x1c0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 2, s33
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff, v28
-; GFX8-NEXT:    v_mov_b32_e32 v15, v1
-; GFX8-NEXT:    v_mov_b32_e32 v17, v1
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[14:17]
+; GFX8-NEXT:    s_addc_u32 s23, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_mov_b32_e32 v2, s24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    s_add_u32 s22, s0, 0x90
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s23, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s21
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 4, s24
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v23
-; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v22
-; GFX8-NEXT:    v_mov_b32_e32 v8, v1
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 2, s24
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[7:10]
-; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v21
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, v1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s20
+; GFX8-NEXT:    s_add_u32 s20, s0, 0x80
+; GFX8-NEXT:    s_addc_u32 s21, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v5, s21
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    s_add_u32 s18, s0, 0x70
+; GFX8-NEXT:    s_addc_u32 s19, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NEXT:    v_mov_b32_e32 v5, s19
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    s_add_u32 s16, s0, 0x60
+; GFX8-NEXT:    s_addc_u32 s17, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NEXT:    v_mov_b32_e32 v5, s17
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NEXT:    s_add_u32 s12, s0, 0x50
+; GFX8-NEXT:    s_addc_u32 s13, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s12
+; GFX8-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NEXT:    v_mov_b32_e32 v5, s13
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    s_add_u32 s10, s0, 64
+; GFX8-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NEXT:    s_add_u32 s8, s0, 48
+; GFX8-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NEXT:    s_addc_u32 s9, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NEXT:    s_add_u32 s6, s0, 32
+; GFX8-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    s_add_u32 s4, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NEXT:    s_addc_u32 s5, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v11
-; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -8432,207 +8193,167 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 12, s2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    v_lshrrev_b16 v10, 3, s2
-; GFX12-NEXT:    s_bfe_u32 s8, s3, 0x10014
-; GFX12-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT:    v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT:    s_bfe_u32 s9, s3, 0x10015
-; GFX12-NEXT:    v_lshrrev_b16 v12, 1, s2
-; GFX12-NEXT:    v_lshrrev_b16 v16, 11, s3
-; GFX12-NEXT:    v_and_b32_e32 v36, 1, v4
-; GFX12-NEXT:    v_and_b32_e32 v43, 1, v10
-; GFX12-NEXT:    v_dual_mov_b32 v68, v1 :: v_dual_and_b32 v69, 1, v2
-; GFX12-NEXT:    v_dual_mov_b32 v62, v1 :: v_dual_and_b32 v71, 0xffff, v0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_and_b32 v67, 0xffff, v3
-; GFX12-NEXT:    v_mov_b32_e32 v66, v1
-; GFX12-NEXT:    v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX12-NEXT:    v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v14, 13, s3
-; GFX12-NEXT:    v_lshrrev_b16 v18, 9, s3
-; GFX12-NEXT:    v_dual_mov_b32 v47, v1 :: v_dual_and_b32 v38, 1, v6
-; GFX12-NEXT:    v_lshrrev_b16 v4, 5, s4
-; GFX12-NEXT:    v_lshrrev_b16 v6, 3, s4
-; GFX12-NEXT:    s_bfe_u32 s8, s3, 0x10016
-; GFX12-NEXT:    s_bfe_u32 s9, s3, 0x10017
-; GFX12-NEXT:    v_lshrrev_b16 v20, 7, s3
-; GFX12-NEXT:    v_and_b32_e32 v45, 1, v12
-; GFX12-NEXT:    v_and_b32_e32 v41, 1, v16
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10014
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10015
+; GFX12-NEXT:    s_lshr_b32 s4, s3, 31
+; GFX12-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1001e
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:416
-; GFX12-NEXT:    v_mov_b32_e32 v2, s9
-; GFX12-NEXT:    v_mov_b32_e32 v0, s8
-; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX12-NEXT:    v_lshrrev_b16 v22, 5, s3
-; GFX12-NEXT:    v_lshrrev_b16 v24, 3, s3
-; GFX12-NEXT:    v_dual_mov_b32 v49, v1 :: v_dual_and_b32 v40, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v44, 1, v14
-; GFX12-NEXT:    v_and_b32_e32 v14, 1, v6
-; GFX12-NEXT:    v_lshrrev_b16 v6, 5, s5
-; GFX12-NEXT:    v_lshrrev_b16 v8, 1, s5
-; GFX12-NEXT:    v_lshrrev_b16 v10, 3, s5
-; GFX12-NEXT:    s_bfe_u32 s8, s3, 0x10012
-; GFX12-NEXT:    v_and_b32_e32 v37, 1, v18
-; GFX12-NEXT:    v_and_b32_e32 v18, 1, v4
-; GFX12-NEXT:    v_lshrrev_b16 v4, 1, s4
-; GFX12-NEXT:    s_bfe_u32 s9, s3, 0x10013
-; GFX12-NEXT:    v_and_b32_e32 v33, 1, v20
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:432
-; GFX12-NEXT:    v_mov_b32_e32 v2, s9
-; GFX12-NEXT:    v_mov_b32_e32 v0, s8
-; GFX12-NEXT:    v_lshrrev_b16 v9, 15, s3
-; GFX12-NEXT:    v_lshrrev_b16 v11, 14, s3
-; GFX12-NEXT:    v_lshrrev_b16 v23, 12, s3
-; GFX12-NEXT:    v_lshrrev_b16 v25, 10, s3
-; GFX12-NEXT:    v_lshrrev_b16 v27, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v29, 6, s3
-; GFX12-NEXT:    v_lshrrev_b16 v28, 4, s3
-; GFX12-NEXT:    v_lshrrev_b16 v26, 1, s3
-; GFX12-NEXT:    v_and_b32_e32 v30, 1, v22
-; GFX12-NEXT:    v_and_b32_e32 v31, 1, v24
-; GFX12-NEXT:    v_lshrrev_b16 v24, 2, s3
-; GFX12-NEXT:    v_lshrrev_b16 v22, 7, s5
-; GFX12-NEXT:    v_lshrrev_b16 v20, 6, s5
-; GFX12-NEXT:    v_and_b32_e32 v39, 1, v6
-; GFX12-NEXT:    v_lshrrev_b16 v6, 2, s5
-; GFX12-NEXT:    s_and_b32 s6, s3, 1
-; GFX12-NEXT:    s_bfe_u32 s8, s3, 0x10011
-; GFX12-NEXT:    v_and_b32_e32 v35, 1, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v8
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v10
-; GFX12-NEXT:    v_lshrrev_b16 v10, 4, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x1001d
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1001c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:496
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1001a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:480
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10019
 ; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT:    s_bfe_u32 s3, s3, 0x10010
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:464
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10016
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10012
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10011
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    v_mov_b32_e32 v2, s8
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10017
-; GFX12-NEXT:    v_lshrrev_b16 v13, 10, s2
-; GFX12-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX12-NEXT:    v_and_b32_e32 v82, 0xffff, v35
-; GFX12-NEXT:    v_and_b32_e32 v35, 1, v27
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1000e
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1000c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:368
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x1000a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:352
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10008
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:336
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10006
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:320
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10004
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:304
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s5, s3, 0x10002
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:288
+; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x10001
+; GFX12-NEXT:    s_and_b32 s3, s3, 1
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:272
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v27, v1
-; GFX12-NEXT:    v_and_b32_e32 v81, 0xffff, v4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 31
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1001e
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:256
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1001d
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1001c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1001b
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1001a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10019
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10017
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10016
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10015
-; GFX12-NEXT:    v_dual_mov_b32 v72, v1 :: v_dual_and_b32 v65, 1, v13
-; GFX12-NEXT:    v_mov_b32_e32 v13, v1
-; GFX12-NEXT:    v_and_b32_e32 v83, 0xffff, v26
-; GFX12-NEXT:    v_and_b32_e32 v26, 0xffff, v31
-; GFX12-NEXT:    v_and_b32_e32 v31, 1, v29
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10015
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:176
-; GFX12-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT:    v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT:    v_lshrrev_b16 v21, 2, s2
-; GFX12-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10013
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10012
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:160
-; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    v_mov_b32_e32 v2, s8
-; GFX12-NEXT:    v_lshrrev_b16 v15, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v17, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v19, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v32, 7, s4
-; GFX12-NEXT:    v_lshrrev_b16 v34, 6, s4
-; GFX12-NEXT:    v_lshrrev_b16 v16, 4, s4
-; GFX12-NEXT:    v_lshrrev_b16 v12, 2, s4
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT:    s_and_b32 s7, s2, 1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10010
-; GFX12-NEXT:    v_and_b32_e32 v4, 1, v6
-; GFX12-NEXT:    v_and_b32_e32 v37, 0xffff, v37
-; GFX12-NEXT:    v_dual_mov_b32 v78, v1 :: v_dual_and_b32 v41, 0xffff, v41
-; GFX12-NEXT:    v_dual_mov_b32 v80, v1 :: v_dual_and_b32 v29, 0xffff, v45
-; GFX12-NEXT:    v_dual_mov_b32 v21, v1 :: v_dual_and_b32 v50, 1, v21
-; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v8
-; GFX12-NEXT:    v_and_b32_e32 v8, 1, v10
-; GFX12-NEXT:    v_and_b32_e32 v10, 0xffff, v39
-; GFX12-NEXT:    v_and_b32_e32 v39, 1, v25
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_and_b32 v77, 1, v7
-; GFX12-NEXT:    v_and_b32_e32 v79, 0xffff, v5
-; GFX12-NEXT:    v_dual_mov_b32 v70, v1 :: v_dual_and_b32 v63, 0xffff, v36
-; GFX12-NEXT:    v_dual_mov_b32 v74, v1 :: v_dual_and_b32 v61, 1, v15
-; GFX12-NEXT:    v_dual_mov_b32 v64, v1 :: v_dual_and_b32 v73, 1, v11
-; GFX12-NEXT:    v_dual_mov_b32 v59, v1 :: v_dual_and_b32 v12, 1, v12
-; GFX12-NEXT:    v_dual_mov_b32 v51, v1 :: v_dual_and_b32 v20, 1, v20
-; GFX12-NEXT:    v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v22, 0xffff, v22
-; GFX12-NEXT:    v_and_b32_e32 v52, 0xffff, v43
-; GFX12-NEXT:    v_and_b32_e32 v58, 1, v17
-; GFX12-NEXT:    v_dual_mov_b32 v25, v1 :: v_dual_and_b32 v60, 0xffff, v38
-; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000f
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000e
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:128
-; GFX12-NEXT:    v_mov_b32_e32 v0, s7
-; GFX12-NEXT:    v_mov_b32_e32 v2, v29
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v1, v[77:80], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v1, v[69:72], s[0:1] offset:96
-; GFX12-NEXT:    global_store_b128 v1, v[65:68], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v1, v[61:64], s[0:1] offset:64
-; GFX12-NEXT:    v_dual_mov_b32 v61, v1 :: v_dual_and_b32 v16, 1, v16
-; GFX12-NEXT:    v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v54, 1, v19
-; GFX12-NEXT:    v_dual_mov_b32 v15, v1 :: v_dual_and_b32 v56, 0xffff, v40
-; GFX12-NEXT:    v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v18, 0xffff, v18
-; GFX12-NEXT:    v_and_b32_e32 v46, 1, v34
-; GFX12-NEXT:    v_and_b32_e32 v48, 0xffff, v32
-; GFX12-NEXT:    v_dual_mov_b32 v76, v1 :: v_dual_and_b32 v75, 0xffff, v9
-; GFX12-NEXT:    v_dual_mov_b32 v43, v1 :: v_dual_and_b32 v24, 1, v24
-; GFX12-NEXT:    v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v42, 1, v23
-; GFX12-NEXT:    v_dual_mov_b32 v45, v1 :: v_dual_and_b32 v44, 0xffff, v44
-; GFX12-NEXT:    s_clause 0x6
-; GFX12-NEXT:    global_store_b128 v1, v[58:61], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v1, v[54:57], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v1, v[50:53], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v1, v[46:49], s[0:1] offset:496
-; GFX12-NEXT:    global_store_b128 v1, v[73:76], s[0:1] offset:368
-; GFX12-NEXT:    global_store_b128 v1, v[42:45], s[0:1] offset:352
-; GFX12-NEXT:    v_mov_b32_e32 v40, v1
-; GFX12-NEXT:    v_mov_b32_e32 v42, v1
-; GFX12-NEXT:    v_mov_b32_e32 v32, v1
-; GFX12-NEXT:    v_mov_b32_e32 v34, v1
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    v_mov_b32_e32 v2, v83
-; GFX12-NEXT:    v_mov_b32_e32 v36, v1
-; GFX12-NEXT:    v_dual_mov_b32 v38, v1 :: v_dual_mov_b32 v7, v1
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v1, v[39:42], s[0:1] offset:336
-; GFX12-NEXT:    global_store_b128 v1, v[35:38], s[0:1] offset:320
-; GFX12-NEXT:    global_store_b128 v1, v[31:34], s[0:1] offset:304
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:256
-; GFX12-NEXT:    v_mov_b32_e32 v0, s5
-; GFX12-NEXT:    v_dual_mov_b32 v2, v82 :: v_dual_mov_b32 v23, v1
-; GFX12-NEXT:    v_mov_b32_e32 v19, v1
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v1, v[24:27], s[0:1] offset:272
-; GFX12-NEXT:    global_store_b128 v1, v[20:23], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v1, v[16:19], s[0:1] offset:480
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:448
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_dual_mov_b32 v2, v81 :: v_dual_mov_b32 v9, v1
-; GFX12-NEXT:    v_mov_b32_e32 v11, v1
-; GFX12-NEXT:    v_dual_mov_b32 v5, v1 :: v_dual_and_b32 v28, 1, v28
-; GFX12-NEXT:    v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v30, 0xffff, v30
-; GFX12-NEXT:    v_mov_b32_e32 v31, v1
-; GFX12-NEXT:    s_clause 0x4
-; GFX12-NEXT:    global_store_b128 v1, v[12:15], s[0:1] offset:464
-; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v1, v[28:31], s[0:1] offset:288
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000d
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000c
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000b
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000a
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10008
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10007
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10006
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10005
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10004
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10002
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10001
+; GFX12-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -8968,381 +8689,445 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ;
 ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NEXT:    s_mov_b32 s7, 0
-; GFX8-NEXT:    s_mov_b32 s13, s7
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX8-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_mov_b32 s90, -1
+; GFX8-NEXT:    s_mov_b32 s91, 0xe80000
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dwordx2 s[10:11], s[2:3], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v29, s1
-; GFX8-NEXT:    v_mov_b32_e32 v28, s0
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_u32 s88, s88, s9
+; GFX8-NEXT:    s_addc_u32 s89, s89, 0
+; GFX8-NEXT:    ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshr_b32 s16, s11, 22
-; GFX8-NEXT:    s_lshr_b32 s18, s11, 23
-; GFX8-NEXT:    s_lshr_b32 s20, s11, 20
-; GFX8-NEXT:    s_lshr_b32 s22, s11, 21
-; GFX8-NEXT:    s_lshr_b32 s24, s11, 18
-; GFX8-NEXT:    s_lshr_b32 s26, s11, 19
-; GFX8-NEXT:    s_lshr_b32 s28, s11, 16
-; GFX8-NEXT:    s_lshr_b32 s30, s11, 17
-; GFX8-NEXT:    s_lshr_b32 s34, s10, 22
-; GFX8-NEXT:    s_lshr_b32 s36, s10, 23
-; GFX8-NEXT:    s_lshr_b32 s38, s10, 20
-; GFX8-NEXT:    s_lshr_b32 s40, s10, 21
-; GFX8-NEXT:    s_lshr_b32 s42, s10, 18
-; GFX8-NEXT:    s_lshr_b32 s44, s10, 19
-; GFX8-NEXT:    s_lshr_b32 s46, s10, 16
-; GFX8-NEXT:    s_lshr_b32 s48, s10, 17
-; GFX8-NEXT:    s_mov_b32 s6, s11
-; GFX8-NEXT:    s_lshr_b32 s12, s11, 24
-; GFX8-NEXT:    s_lshr_b32 s8, s10, 24
-; GFX8-NEXT:    s_bfe_i64 s[2:3], s[8:9], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[4:5], s[12:13], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[14:15], s[10:11], 0x10000
+; GFX8-NEXT:    s_lshr_b32 s0, s3, 8
+; GFX8-NEXT:    v_writelane_b32 v44, s0, 0
+; GFX8-NEXT:    v_writelane_b32 v44, s1, 1
+; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
+; GFX8-NEXT:    s_lshr_b32 s36, s3, 21
+; GFX8-NEXT:    s_lshr_b32 s30, s3, 19
+; GFX8-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x10000
+; GFX8-NEXT:    s_lshr_b32 s74, s3, 30
+; GFX8-NEXT:    s_lshr_b32 s50, s3, 31
+; GFX8-NEXT:    s_lshr_b32 s72, s3, 28
+; GFX8-NEXT:    s_lshr_b32 s48, s3, 29
+; GFX8-NEXT:    s_lshr_b32 s70, s3, 26
+; GFX8-NEXT:    s_lshr_b32 s46, s3, 27
+; GFX8-NEXT:    s_lshr_b32 s68, s3, 24
+; GFX8-NEXT:    s_lshr_b32 s42, s3, 25
+; GFX8-NEXT:    s_lshr_b32 s66, s3, 22
+; GFX8-NEXT:    s_lshr_b32 s40, s3, 23
+; GFX8-NEXT:    s_lshr_b32 s64, s3, 20
+; GFX8-NEXT:    s_lshr_b32 s62, s3, 18
+; GFX8-NEXT:    s_lshr_b32 s56, s3, 16
+; GFX8-NEXT:    s_lshr_b32 s18, s3, 17
+; GFX8-NEXT:    s_lshr_b32 s58, s3, 14
+; GFX8-NEXT:    s_lshr_b32 s38, s3, 15
+; GFX8-NEXT:    s_lshr_b32 s60, s3, 12
+; GFX8-NEXT:    s_lshr_b32 s44, s3, 13
+; GFX8-NEXT:    s_lshr_b32 s54, s3, 10
+; GFX8-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX8-NEXT:    v_writelane_b32 v44, s0, 2
+; GFX8-NEXT:    s_lshr_b32 s52, s3, 11
+; GFX8-NEXT:    s_bfe_i64 s[74:75], s[74:75], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX8-NEXT:    v_mov_b32_e32 v18, s36
+; GFX8-NEXT:    v_mov_b32_e32 v19, s37
+; GFX8-NEXT:    v_mov_b32_e32 v26, s30
+; GFX8-NEXT:    v_mov_b32_e32 v27, s31
+; GFX8-NEXT:    s_bfe_i64 s[30:31], s[44:45], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[36:37], s[38:39], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX8-NEXT:    v_writelane_b32 v44, s1, 3
+; GFX8-NEXT:    s_lshr_b32 s6, s3, 9
+; GFX8-NEXT:    s_lshr_b32 s8, s3, 6
+; GFX8-NEXT:    s_lshr_b32 s10, s3, 7
+; GFX8-NEXT:    s_lshr_b32 s12, s3, 4
+; GFX8-NEXT:    s_lshr_b32 s14, s3, 5
+; GFX8-NEXT:    s_lshr_b32 s16, s3, 2
+; GFX8-NEXT:    s_lshr_b32 s20, s3, 3
+; GFX8-NEXT:    s_lshr_b32 s22, s3, 1
+; GFX8-NEXT:    s_mov_b32 s24, s3
+; GFX8-NEXT:    s_lshr_b32 s26, s2, 30
+; GFX8-NEXT:    s_lshr_b32 s28, s2, 31
+; GFX8-NEXT:    s_lshr_b32 s34, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v4, s74
+; GFX8-NEXT:    v_mov_b32_e32 v8, s72
+; GFX8-NEXT:    v_mov_b32_e32 v0, s70
+; GFX8-NEXT:    v_mov_b32_e32 v55, s68
+; GFX8-NEXT:    v_mov_b32_e32 v20, s66
+; GFX8-NEXT:    v_mov_b32_e32 v16, s64
+; GFX8-NEXT:    v_mov_b32_e32 v24, s62
+; GFX8-NEXT:    v_mov_b32_e32 v28, s56
+; GFX8-NEXT:    v_mov_b32_e32 v32, s58
+; GFX8-NEXT:    v_mov_b32_e32 v36, s60
+; GFX8-NEXT:    s_lshr_b32 s86, s2, 29
+; GFX8-NEXT:    v_mov_b32_e32 v40, s54
+; GFX8-NEXT:    s_lshr_b32 s84, s2, 26
+; GFX8-NEXT:    s_lshr_b32 s82, s2, 27
+; GFX8-NEXT:    s_lshr_b32 s80, s2, 24
+; GFX8-NEXT:    v_mov_b32_e32 v6, s50
+; GFX8-NEXT:    s_lshr_b32 s78, s2, 25
+; GFX8-NEXT:    s_lshr_b32 s76, s2, 22
+; GFX8-NEXT:    v_mov_b32_e32 v10, s48
+; GFX8-NEXT:    s_lshr_b32 s74, s2, 23
+; GFX8-NEXT:    s_lshr_b32 s72, s2, 20
+; GFX8-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NEXT:    s_lshr_b32 s70, s2, 21
+; GFX8-NEXT:    s_lshr_b32 s68, s2, 18
+; GFX8-NEXT:    v_mov_b32_e32 v57, s42
+; GFX8-NEXT:    s_lshr_b32 s66, s2, 19
+; GFX8-NEXT:    s_lshr_b32 s64, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v22, s40
+; GFX8-NEXT:    s_lshr_b32 s62, s2, 17
+; GFX8-NEXT:    s_lshr_b32 s60, s2, 14
+; GFX8-NEXT:    s_lshr_b32 s58, s2, 15
+; GFX8-NEXT:    s_lshr_b32 s56, s2, 12
+; GFX8-NEXT:    s_lshr_b32 s54, s2, 13
+; GFX8-NEXT:    s_bfe_i64 vcc, s[52:53], 0x10000
+; GFX8-NEXT:    s_lshr_b32 s52, s2, 10
+; GFX8-NEXT:    v_mov_b32_e32 v30, s18
+; GFX8-NEXT:    v_mov_b32_e32 v31, s19
+; GFX8-NEXT:    s_lshr_b32 s50, s2, 11
+; GFX8-NEXT:    s_lshr_b32 s48, s2, 8
+; GFX8-NEXT:    v_mov_b32_e32 v34, s36
+; GFX8-NEXT:    s_lshr_b32 s46, s2, 9
+; GFX8-NEXT:    s_lshr_b32 s44, s2, 6
+; GFX8-NEXT:    v_mov_b32_e32 v38, s30
+; GFX8-NEXT:    s_lshr_b32 s42, s2, 7
+; GFX8-NEXT:    s_lshr_b32 s40, s2, 4
+; GFX8-NEXT:    s_lshr_b32 s38, s2, 5
+; GFX8-NEXT:    s_lshr_b32 s36, s2, 2
+; GFX8-NEXT:    s_lshr_b32 s30, s2, 3
+; GFX8-NEXT:    s_bfe_i64 s[18:19], s[2:3], 0x10000
+; GFX8-NEXT:    v_readlane_b32 s2, v44, 0
+; GFX8-NEXT:    v_readlane_b32 s3, v44, 1
+; GFX8-NEXT:    v_mov_b32_e32 v5, s75
+; GFX8-NEXT:    v_mov_b32_e32 v7, s51
+; GFX8-NEXT:    v_mov_b32_e32 v9, s73
+; GFX8-NEXT:    v_mov_b32_e32 v11, s49
+; GFX8-NEXT:    v_mov_b32_e32 v1, s71
+; GFX8-NEXT:    v_mov_b32_e32 v3, s47
+; GFX8-NEXT:    v_mov_b32_e32 v56, s69
+; GFX8-NEXT:    v_mov_b32_e32 v58, s43
+; GFX8-NEXT:    v_mov_b32_e32 v21, s67
+; GFX8-NEXT:    v_mov_b32_e32 v23, s41
+; GFX8-NEXT:    v_mov_b32_e32 v17, s65
+; GFX8-NEXT:    v_mov_b32_e32 v25, s63
+; GFX8-NEXT:    v_mov_b32_e32 v29, s57
+; GFX8-NEXT:    v_mov_b32_e32 v33, s59
+; GFX8-NEXT:    v_mov_b32_e32 v35, s37
+; GFX8-NEXT:    v_mov_b32_e32 v37, s61
+; GFX8-NEXT:    v_mov_b32_e32 v39, s31
+; GFX8-NEXT:    v_mov_b32_e32 v41, s55
+; GFX8-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[74:75], s[74:75], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[76:77], s[76:77], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[80:81], s[80:81], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[82:83], s[82:83], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[84:85], s[84:85], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[86:87], s[86:87], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX8-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
 ; GFX8-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX8-NEXT:    v_mov_b32_e32 v22, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x1b0
-; GFX8-NEXT:    v_mov_b32_e32 v23, s17
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x1a0
-; GFX8-NEXT:    v_mov_b32_e32 v24, s18
-; GFX8-NEXT:    v_mov_b32_e32 v25, s19
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x190
-; GFX8-NEXT:    v_mov_b32_e32 v22, s20
-; GFX8-NEXT:    v_mov_b32_e32 v23, s21
-; GFX8-NEXT:    v_mov_b32_e32 v24, s22
-; GFX8-NEXT:    v_mov_b32_e32 v25, s23
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x180
-; GFX8-NEXT:    v_mov_b32_e32 v22, s24
-; GFX8-NEXT:    v_mov_b32_e32 v23, s25
-; GFX8-NEXT:    v_mov_b32_e32 v24, s26
-; GFX8-NEXT:    v_mov_b32_e32 v25, s27
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0xb0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s28
-; GFX8-NEXT:    v_mov_b32_e32 v23, s29
-; GFX8-NEXT:    v_mov_b32_e32 v24, s30
-; GFX8-NEXT:    v_mov_b32_e32 v25, s31
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0xa0
-; GFX8-NEXT:    v_mov_b32_e32 v22, s34
-; GFX8-NEXT:    v_mov_b32_e32 v23, s35
-; GFX8-NEXT:    v_mov_b32_e32 v24, s36
-; GFX8-NEXT:    v_mov_b32_e32 v25, s37
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x90
-; GFX8-NEXT:    v_mov_b32_e32 v22, s38
-; GFX8-NEXT:    v_mov_b32_e32 v23, s39
-; GFX8-NEXT:    v_mov_b32_e32 v24, s40
-; GFX8-NEXT:    v_mov_b32_e32 v25, s41
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x80
-; GFX8-NEXT:    v_mov_b32_e32 v22, s42
-; GFX8-NEXT:    v_mov_b32_e32 v23, s43
-; GFX8-NEXT:    v_mov_b32_e32 v24, s44
-; GFX8-NEXT:    v_mov_b32_e32 v25, s45
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_mov_b32_e32 v27, s17
-; GFX8-NEXT:    v_mov_b32_e32 v26, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x70
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 14, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 15, s10
-; GFX8-NEXT:    v_mov_b32_e32 v22, s46
-; GFX8-NEXT:    v_mov_b32_e32 v23, s47
-; GFX8-NEXT:    v_mov_b32_e32 v24, s48
-; GFX8-NEXT:    v_mov_b32_e32 v25, s49
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT:    v_bfe_i32 v26, v21, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v20, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v21, s17
-; GFX8-NEXT:    v_mov_b32_e32 v20, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x60
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 12, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 13, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[24:27]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 10, s10
-; GFX8-NEXT:    v_bfe_i32 v26, v19, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v18, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v19, s17
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 11, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    v_mov_b32_e32 v18, s16
-; GFX8-NEXT:    s_add_u32 s16, s0, 0x50
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[24:27]
-; GFX8-NEXT:    s_addc_u32 s17, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v26, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v16, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v16, s16
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 8, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 9, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 6, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 7, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 4, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 5, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 2, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 3, s10
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 1, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    v_mov_b32_e32 v17, s17
-; GFX8-NEXT:    s_add_u32 s10, s0, 64
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 14, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 15, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 12, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 13, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 10, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 8, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 9, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 6, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 7, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 4, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 5, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 2, s11
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[24:27]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 3, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 1, s11
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v26, v15, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v14, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v15, s11
-; GFX8-NEXT:    v_mov_b32_e32 v14, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 48
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[0:1], s[6:7], 0x10000
+; GFX8-NEXT:    s_bfe_i64 s[6:7], s[2:3], 0x10000
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1f0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v43, s3
+; GFX8-NEXT:    v_mov_b32_e32 v42, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1e0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v46, s3
+; GFX8-NEXT:    v_mov_b32_e32 v45, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1d0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v48, s3
+; GFX8-NEXT:    v_mov_b32_e32 v47, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1c0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v50, s3
+; GFX8-NEXT:    v_mov_b32_e32 v49, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1b0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v52, s3
+; GFX8-NEXT:    v_mov_b32_e32 v51, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1a0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v54, s3
+; GFX8-NEXT:    v_mov_b32_e32 v53, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x190
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v15, s3
+; GFX8-NEXT:    v_mov_b32_e32 v14, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x180
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
+; GFX8-NEXT:    buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    flat_store_dwordx4 v[42:43], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[45:46], v[8:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[47:48], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[49:50], v[55:58]
+; GFX8-NEXT:    flat_store_dwordx4 v[51:52], v[20:23]
+; GFX8-NEXT:    flat_store_dwordx4 v[53:54], v[16:19]
 ; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[24:27]
-; GFX8-NEXT:    v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v26, v13, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v12, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v13, s11
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    v_mov_b32_e32 v12, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v26, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v10, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v10, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    v_mov_b32_e32 v11, s11
-; GFX8-NEXT:    s_add_u32 s10, s0, 16
-; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[24:27]
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v26, v9, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v8, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v8, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    v_mov_b32_e32 v9, s11
-; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[24:27]
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x170
-; GFX8-NEXT:    v_bfe_i32 v26, v7, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_mov_b32_e32 v24, s14
-; GFX8-NEXT:    v_mov_b32_e32 v25, s15
-; GFX8-NEXT:    flat_store_dwordx4 v[28:29], v[24:27]
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_bfe_i32 v26, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v24, v5, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v5, s10
-; GFX8-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x160
-; GFX8-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX8-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[5:6], v[24:27]
-; GFX8-NEXT:    v_bfe_i32 v5, v4, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v25, s11
-; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX8-NEXT:    v_mov_b32_e32 v24, s10
-; GFX8-NEXT:    s_add_u32 s10, s0, 0x150
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[3:6]
-; GFX8-NEXT:    v_bfe_i32 v1, v1, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v3, v2, 0, 1
-; GFX8-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s10
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; GFX8-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 6, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v28, 7, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 4, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 5, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 2, s8
-; GFX8-NEXT:    v_lshrrev_b16_e64 v25, 3, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[5:6], v[1:4]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 1, s8
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x140
-; GFX8-NEXT:    v_bfe_i32 v2, v23, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s9
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x130
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT:    v_bfe_i32 v4, v22, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v2, v6, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v6, v21, 0, 1
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x120
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    v_bfe_i32 v21, v19, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v19, v20, 0, 1
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    s_add_u32 s8, s0, 0x110
-; GFX8-NEXT:    v_bfe_i32 v6, v25, 0, 1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT:    v_bfe_i32 v25, v17, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v23, v18, 0, 1
-; GFX8-NEXT:    s_addc_u32 s9, s1, 0
+; GFX8-NEXT:    buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x170
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v60, s3
+; GFX8-NEXT:    v_mov_b32_e32 v59, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x160
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v62, s3
+; GFX8-NEXT:    v_mov_b32_e32 v61, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x150
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v46, s3
+; GFX8-NEXT:    v_mov_b32_e32 v45, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x140
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x130
+; GFX8-NEXT:    v_mov_b32_e32 v7, s1
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x120
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v15, s1
+; GFX8-NEXT:    v_mov_b32_e32 v14, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x110
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v42, vcc_lo
+; GFX8-NEXT:    v_mov_b32_e32 v43, vcc_hi
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NEXT:    v_bfe_i32 v4, v24, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v19, v26, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 6, s12
-; GFX8-NEXT:    v_mov_b32_e32 v23, s6
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x100
-; GFX8-NEXT:    v_bfe_i32 v25, v16, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v24, s7
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 7, s12
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x1f0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT:    v_bfe_i32 v16, v15, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 4, s12
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 5, s12
-; GFX8-NEXT:    v_ashrrev_i32_e32 v17, 31, v16
-; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x1e0
-; GFX8-NEXT:    v_bfe_i32 v21, v27, 0, 1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[14:17]
-; GFX8-NEXT:    v_bfe_i32 v29, v13, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v27, v12, 0, 1
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 2, s12
-; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 3, s12
-; GFX8-NEXT:    v_bfe_i32 v25, v28, 0, 1
-; GFX8-NEXT:    v_ashrrev_i32_e32 v30, 31, v29
-; GFX8-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    s_add_u32 s6, s0, 0x1d0
-; GFX8-NEXT:    v_bfe_i32 v23, v9, 0, 1
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[27:30]
-; GFX8-NEXT:    v_bfe_i32 v11, v11, 0, 1
-; GFX8-NEXT:    v_bfe_i32 v9, v10, 0, 1
-; GFX8-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v11
-; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v9
-; GFX8-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 1, s12
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
-; GFX8-NEXT:    v_bfe_i32 v14, v8, 0, 1
-; GFX8-NEXT:    v_mov_b32_e32 v12, s4
-; GFX8-NEXT:    s_add_u32 s4, s0, 0x1c0
-; GFX8-NEXT:    v_mov_b32_e32 v13, s5
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xf0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX8-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_add_u32 s4, s0, 0xd0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s12
+; GFX8-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mov_b32_e32 v9, s13
+; GFX8-NEXT:    v_mov_b32_e32 v10, s14
+; GFX8-NEXT:    v_mov_b32_e32 v11, s15
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[28:31]
+; GFX8-NEXT:    flat_store_dwordx4 v[59:60], v[32:35]
+; GFX8-NEXT:    flat_store_dwordx4 v[61:62], v[36:39]
+; GFX8-NEXT:    flat_store_dwordx4 v[45:46], v[40:43]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x100
+; GFX8-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xf0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NEXT:    v_mov_b32_e32 v1, s25
+; GFX8-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NEXT:    v_mov_b32_e32 v3, s23
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xe0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NEXT:    v_mov_b32_e32 v1, s27
+; GFX8-NEXT:    v_mov_b32_e32 v2, s28
+; GFX8-NEXT:    v_mov_b32_e32 v3, s29
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xd0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NEXT:    v_mov_b32_e32 v2, s86
+; GFX8-NEXT:    v_mov_b32_e32 v3, s87
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s84
+; GFX8-NEXT:    v_mov_b32_e32 v1, s85
+; GFX8-NEXT:    v_mov_b32_e32 v2, s82
+; GFX8-NEXT:    v_mov_b32_e32 v3, s83
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s80
+; GFX8-NEXT:    v_mov_b32_e32 v1, s81
+; GFX8-NEXT:    v_mov_b32_e32 v2, s78
+; GFX8-NEXT:    v_mov_b32_e32 v3, s79
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0xa0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s76
+; GFX8-NEXT:    v_mov_b32_e32 v1, s77
+; GFX8-NEXT:    v_mov_b32_e32 v2, s74
+; GFX8-NEXT:    v_mov_b32_e32 v3, s75
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x90
+; GFX8-NEXT:    v_mov_b32_e32 v0, s72
+; GFX8-NEXT:    v_mov_b32_e32 v1, s73
+; GFX8-NEXT:    v_mov_b32_e32 v2, s70
+; GFX8-NEXT:    v_mov_b32_e32 v3, s71
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x80
+; GFX8-NEXT:    v_mov_b32_e32 v0, s68
+; GFX8-NEXT:    v_mov_b32_e32 v1, s69
+; GFX8-NEXT:    v_mov_b32_e32 v2, s66
+; GFX8-NEXT:    v_mov_b32_e32 v3, s67
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x70
+; GFX8-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NEXT:    v_mov_b32_e32 v1, s65
+; GFX8-NEXT:    v_mov_b32_e32 v2, s62
+; GFX8-NEXT:    v_mov_b32_e32 v3, s63
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x60
+; GFX8-NEXT:    v_mov_b32_e32 v0, s60
+; GFX8-NEXT:    v_mov_b32_e32 v1, s61
+; GFX8-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NEXT:    v_mov_b32_e32 v3, s59
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 0x50
+; GFX8-NEXT:    v_mov_b32_e32 v0, s56
+; GFX8-NEXT:    v_mov_b32_e32 v1, s57
+; GFX8-NEXT:    v_mov_b32_e32 v2, s54
+; GFX8-NEXT:    v_mov_b32_e32 v3, s55
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 64
+; GFX8-NEXT:    v_mov_b32_e32 v0, s52
+; GFX8-NEXT:    v_mov_b32_e32 v1, s53
+; GFX8-NEXT:    v_mov_b32_e32 v2, s50
+; GFX8-NEXT:    v_mov_b32_e32 v3, s51
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 48
+; GFX8-NEXT:    v_mov_b32_e32 v0, s48
+; GFX8-NEXT:    v_mov_b32_e32 v1, s49
+; GFX8-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NEXT:    v_mov_b32_e32 v3, s47
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 32
+; GFX8-NEXT:    v_mov_b32_e32 v0, s44
+; GFX8-NEXT:    v_mov_b32_e32 v1, s45
+; GFX8-NEXT:    v_mov_b32_e32 v2, s42
+; GFX8-NEXT:    v_mov_b32_e32 v3, s43
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 16
+; GFX8-NEXT:    v_mov_b32_e32 v0, s40
+; GFX8-NEXT:    v_mov_b32_e32 v1, s41
+; GFX8-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NEXT:    v_mov_b32_e32 v3, s39
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s36
+; GFX8-NEXT:    v_mov_b32_e32 v1, s37
+; GFX8-NEXT:    v_mov_b32_e32 v2, s30
+; GFX8-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_readlane_b32 s0, v44, 2
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    v_readlane_b32 s1, v44, 3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    ; kill: killed $vgpr44
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_sextload_v64i1_to_v64i64:
@@ -9716,246 +9501,242 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b32 s19, 0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_mov_b32 s5, s19
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b64 s[40:41], s[2:3], 0x0
+; GFX12-NEXT:    s_load_b64 s[12:13], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s26, s41, 22
-; GFX12-NEXT:    s_lshr_b32 s28, s41, 23
-; GFX12-NEXT:    s_lshr_b32 s30, s41, 20
-; GFX12-NEXT:    s_lshr_b32 s34, s41, 21
-; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s20, s41, 18
-; GFX12-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s96, s13, 30
+; GFX12-NEXT:    s_lshr_b32 s98, s13, 31
+; GFX12-NEXT:    s_lshr_b32 s92, s13, 28
+; GFX12-NEXT:    s_lshr_b32 s94, s13, 29
+; GFX12-NEXT:    s_lshr_b32 s78, s13, 26
+; GFX12-NEXT:    s_lshr_b32 s88, s13, 27
+; GFX12-NEXT:    s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[100:101], s[98:99], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s66, s13, 24
+; GFX12-NEXT:    s_lshr_b32 s74, s13, 25
+; GFX12-NEXT:    s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[94:95], s[94:95], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s96
+; GFX12-NEXT:    s_lshr_b32 s56, s13, 22
+; GFX12-NEXT:    s_lshr_b32 s62, s13, 23
+; GFX12-NEXT:    v_dual_mov_b32 v2, s97 :: v_dual_mov_b32 v3, s100
+; GFX12-NEXT:    v_dual_mov_b32 v4, s101 :: v_dual_mov_b32 v5, s92
+; GFX12-NEXT:    s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[88:89], s[88:89], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s44, s13, 20
+; GFX12-NEXT:    s_lshr_b32 s52, s13, 21
+; GFX12-NEXT:    s_lshr_b32 s30, s13, 18
+; GFX12-NEXT:    s_lshr_b32 s40, s13, 19
+; GFX12-NEXT:    s_lshr_b32 s18, s13, 16
+; GFX12-NEXT:    s_lshr_b32 s26, s13, 17
+; GFX12-NEXT:    s_lshr_b32 s2, s13, 14
+; GFX12-NEXT:    s_lshr_b32 s4, s13, 15
+; GFX12-NEXT:    v_dual_mov_b32 v6, s93 :: v_dual_mov_b32 v7, s94
+; GFX12-NEXT:    v_dual_mov_b32 v8, s95 :: v_dual_mov_b32 v9, s78
+; GFX12-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[74:75], s[74:75], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s6, s13, 12
+; GFX12-NEXT:    s_lshr_b32 s8, s13, 13
+; GFX12-NEXT:    v_dual_mov_b32 v10, s79 :: v_dual_mov_b32 v11, s88
+; GFX12-NEXT:    v_dual_mov_b32 v12, s89 :: v_dual_mov_b32 v13, s66
+; GFX12-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s10, s13, 10
+; GFX12-NEXT:    s_lshr_b32 s14, s13, 11
+; GFX12-NEXT:    v_dual_mov_b32 v14, s67 :: v_dual_mov_b32 v15, s74
+; GFX12-NEXT:    v_dual_mov_b32 v16, s75 :: v_dual_mov_b32 v17, s56
+; GFX12-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v69, s26
-; GFX12-NEXT:    v_dual_mov_b32 v70, s27 :: v_dual_mov_b32 v71, s28
-; GFX12-NEXT:    v_dual_mov_b32 v72, s29 :: v_dual_mov_b32 v73, s30
-; GFX12-NEXT:    s_lshr_b32 s22, s41, 19
-; GFX12-NEXT:    v_dual_mov_b32 v74, s31 :: v_dual_mov_b32 v75, s34
+; GFX12-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s16, s13, 8
+; GFX12-NEXT:    s_lshr_b32 s20, s13, 9
+; GFX12-NEXT:    v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v19, s62
+; GFX12-NEXT:    v_dual_mov_b32 v20, s63 :: v_dual_mov_b32 v21, s44
+; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-NEXT:    s_lshr_b32 s22, s13, 6
+; GFX12-NEXT:    s_lshr_b32 s24, s13, 7
+; GFX12-NEXT:    v_dual_mov_b32 v22, s45 :: v_dual_mov_b32 v23, s52
+; GFX12-NEXT:    v_dual_mov_b32 v24, s53 :: v_dual_mov_b32 v25, s30
+; GFX12-NEXT:    v_dual_mov_b32 v26, s31 :: v_dual_mov_b32 v27, s40
+; GFX12-NEXT:    v_dual_mov_b32 v28, s41 :: v_dual_mov_b32 v29, s18
+; GFX12-NEXT:    v_dual_mov_b32 v30, s19 :: v_dual_mov_b32 v31, s26
+; GFX12-NEXT:    v_mov_b32_e32 v32, s27
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-NEXT:    s_clause 0x7
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:496
+; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:480
+; GFX12-NEXT:    global_store_b128 v0, v[9:12], s[0:1] offset:464
+; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:448
+; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:432
+; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1] offset:416
+; GFX12-NEXT:    global_store_b128 v0, v[25:28], s[0:1] offset:400
+; GFX12-NEXT:    global_store_b128 v0, v[29:32], s[0:1] offset:384
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5
+; GFX12-NEXT:    v_mov_b32_e32 v5, s6
+; GFX12-NEXT:    s_lshr_b32 s28, s13, 4
+; GFX12-NEXT:    s_lshr_b32 s34, s13, 5
+; GFX12-NEXT:    s_lshr_b32 s36, s13, 2
+; GFX12-NEXT:    s_lshr_b32 s38, s13, 3
 ; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX12-NEXT:    v_mov_b32_e32 v76, s35
-; GFX12-NEXT:    s_lshr_b32 s24, s41, 16
-; GFX12-NEXT:    s_lshr_b32 s36, s41, 17
-; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s12, s40, 22
-; GFX12-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s8
+; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v9, s10
+; GFX12-NEXT:    s_lshr_b32 s42, s13, 1
+; GFX12-NEXT:    s_mov_b32 s46, s13
 ; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v12, v[69:72], s[0:1] offset:432
-; GFX12-NEXT:    global_store_b128 v12, v[73:76], s[0:1] offset:416
-; GFX12-NEXT:    v_dual_mov_b32 v69, s20 :: v_dual_mov_b32 v70, s21
-; GFX12-NEXT:    v_dual_mov_b32 v71, s22 :: v_dual_mov_b32 v72, s23
-; GFX12-NEXT:    v_mov_b32_e32 v73, s24
-; GFX12-NEXT:    s_lshr_b32 s14, s40, 23
-; GFX12-NEXT:    v_dual_mov_b32 v74, s25 :: v_dual_mov_b32 v75, s36
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX12-NEXT:    v_mov_b32_e32 v76, s37
-; GFX12-NEXT:    s_lshr_b32 s16, s40, 20
-; GFX12-NEXT:    s_lshr_b32 s38, s40, 21
-; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT:    s_lshr_b32 s6, s40, 18
+; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v11, s14
+; GFX12-NEXT:    v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s16
+; GFX12-NEXT:    s_lshr_b32 s48, s12, 30
+; GFX12-NEXT:    s_lshr_b32 s50, s12, 31
 ; GFX12-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v12, v[69:72], s[0:1] offset:400
-; GFX12-NEXT:    global_store_b128 v12, v[73:76], s[0:1] offset:384
-; GFX12-NEXT:    v_dual_mov_b32 v69, s12 :: v_dual_mov_b32 v70, s13
-; GFX12-NEXT:    v_dual_mov_b32 v71, s14 :: v_dual_mov_b32 v72, s15
-; GFX12-NEXT:    v_mov_b32_e32 v73, s16
-; GFX12-NEXT:    s_lshr_b32 s8, s40, 19
-; GFX12-NEXT:    s_lshr_b32 s10, s40, 16
-; GFX12-NEXT:    s_lshr_b32 s42, s40, 17
-; GFX12-NEXT:    v_dual_mov_b32 v74, s17 :: v_dual_mov_b32 v75, s38
-; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT:    v_mov_b32_e32 v76, s39
+; GFX12-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v15, s20
+; GFX12-NEXT:    v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s22
+; GFX12-NEXT:    s_lshr_b32 s54, s12, 28
+; GFX12-NEXT:    s_lshr_b32 s58, s12, 29
+; GFX12-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x10000
 ; GFX12-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT:    v_lshrrev_b16 v0, 14, s40
-; GFX12-NEXT:    v_lshrrev_b16 v1, 15, s40
-; GFX12-NEXT:    v_lshrrev_b16 v17, 12, s40
-; GFX12-NEXT:    v_lshrrev_b16 v18, 13, s40
-; GFX12-NEXT:    v_lshrrev_b16 v33, 10, s40
-; GFX12-NEXT:    v_lshrrev_b16 v34, 11, s40
-; GFX12-NEXT:    v_lshrrev_b16 v65, 8, s40
-; GFX12-NEXT:    v_lshrrev_b16 v66, 9, s40
-; GFX12-NEXT:    v_lshrrev_b16 v86, 6, s40
-; GFX12-NEXT:    v_lshrrev_b16 v82, 7, s40
-; GFX12-NEXT:    v_lshrrev_b16 v81, 4, s40
-; GFX12-NEXT:    v_lshrrev_b16 v83, 5, s40
-; GFX12-NEXT:    v_lshrrev_b16 v77, 2, s40
-; GFX12-NEXT:    v_lshrrev_b16 v78, 3, s40
-; GFX12-NEXT:    v_lshrrev_b16 v58, 1, s40
-; GFX12-NEXT:    v_lshrrev_b16 v60, 14, s41
-; GFX12-NEXT:    v_lshrrev_b16 v61, 15, s41
-; GFX12-NEXT:    v_lshrrev_b16 v57, 12, s41
-; GFX12-NEXT:    v_lshrrev_b16 v54, 13, s41
-; GFX12-NEXT:    v_lshrrev_b16 v50, 10, s41
-; GFX12-NEXT:    v_lshrrev_b16 v46, 11, s41
-; GFX12-NEXT:    v_lshrrev_b16 v49, 8, s41
-; GFX12-NEXT:    v_lshrrev_b16 v51, 9, s41
-; GFX12-NEXT:    v_lshrrev_b16 v45, 6, s41
-; GFX12-NEXT:    v_lshrrev_b16 v38, 7, s41
-; GFX12-NEXT:    v_lshrrev_b16 v40, 4, s41
-; GFX12-NEXT:    v_lshrrev_b16 v41, 5, s41
-; GFX12-NEXT:    v_lshrrev_b16 v37, 2, s41
-; GFX12-NEXT:    v_lshrrev_b16 v36, 3, s41
-; GFX12-NEXT:    v_lshrrev_b16 v30, 1, s41
-; GFX12-NEXT:    s_lshr_b32 s4, s41, 24
-; GFX12-NEXT:    s_mov_b32 s18, s41
-; GFX12-NEXT:    s_lshr_b32 s2, s40, 24
-; GFX12-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x10000
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v12, v[69:72], s[0:1] offset:176
-; GFX12-NEXT:    global_store_b128 v12, v[73:76], s[0:1] offset:160
-; GFX12-NEXT:    v_dual_mov_b32 v69, s6 :: v_dual_mov_b32 v70, s7
-; GFX12-NEXT:    v_dual_mov_b32 v71, s8 :: v_dual_mov_b32 v72, s9
-; GFX12-NEXT:    v_dual_mov_b32 v73, s10 :: v_dual_mov_b32 v74, s11
-; GFX12-NEXT:    v_dual_mov_b32 v75, s42 :: v_dual_mov_b32 v76, s43
-; GFX12-NEXT:    v_bfe_i32 v79, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v85, v65, 0, 1
-; GFX12-NEXT:    v_mov_b32_e32 v65, s40
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v12, v[69:72], s[0:1] offset:144
-; GFX12-NEXT:    global_store_b128 v12, v[73:76], s[0:1] offset:128
-; GFX12-NEXT:    v_bfe_i32 v69, v77, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v77, v0, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v75, v18, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v73, v17, 0, 1
-; GFX12-NEXT:    v_lshrrev_b16 v26, 6, s4
-; GFX12-NEXT:    v_lshrrev_b16 v28, 7, s4
-; GFX12-NEXT:    v_lshrrev_b16 v20, 2, s4
-; GFX12-NEXT:    v_lshrrev_b16 v14, 3, s4
-; GFX12-NEXT:    v_lshrrev_b16 v22, 4, s4
-; GFX12-NEXT:    v_lshrrev_b16 v25, 5, s4
-; GFX12-NEXT:    v_lshrrev_b16 v19, 1, s4
-; GFX12-NEXT:    v_lshrrev_b16 v9, 6, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 7, s2
-; GFX12-NEXT:    v_bfe_i32 v71, v78, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v80, 31, v79
-; GFX12-NEXT:    v_ashrrev_i32_e32 v78, 31, v77
-; GFX12-NEXT:    v_ashrrev_i32_e32 v76, 31, v75
-; GFX12-NEXT:    v_ashrrev_i32_e32 v74, 31, v73
-; GFX12-NEXT:    v_lshrrev_b16 v8, 4, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 5, s2
-; GFX12-NEXT:    v_lshrrev_b16 v3, 3, s2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 2, s2
-; GFX12-NEXT:    v_lshrrev_b16 v2, 1, s2
-; GFX12-NEXT:    v_bfe_i32 v23, v14, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v21, v20, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v31, v28, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v29, v26, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v55, v46, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v53, v50, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v91, v34, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v89, v33, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v19, v19, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v27, v25, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v25, v22, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v51, v51, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v49, v49, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v87, v66, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v15, v5, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v13, v9, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v47, v38, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v45, v45, 0, 1
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v12, v[77:80], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v12, v[73:76], s[0:1] offset:96
-; GFX12-NEXT:    v_bfe_i32 v77, v82, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v75, v86, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v10, v7, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v8, v8, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v43, v41, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v41, v40, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v83, v83, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v81, v81, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v6, v3, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX12-NEXT:    v_bfe_i32 v39, v36, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v37, v37, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v24, 31, v23
-; GFX12-NEXT:    v_ashrrev_i32_e32 v22, 31, v21
-; GFX12-NEXT:    v_bfe_i32 v35, v30, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v32, 31, v31
-; GFX12-NEXT:    v_ashrrev_i32_e32 v30, 31, v29
-; GFX12-NEXT:    v_ashrrev_i32_e32 v56, 31, v55
-; GFX12-NEXT:    v_bfe_i32 v59, v54, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v54, 31, v53
-; GFX12-NEXT:    v_bfe_i32 v57, v57, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v63, v61, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v61, v60, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v67, v58, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v92, 31, v91
-; GFX12-NEXT:    v_ashrrev_i32_e32 v90, 31, v89
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT:    v_ashrrev_i32_e32 v20, 31, v19
-; GFX12-NEXT:    v_ashrrev_i32_e32 v28, 31, v27
-; GFX12-NEXT:    v_ashrrev_i32_e32 v26, 31, v25
-; GFX12-NEXT:    v_ashrrev_i32_e32 v52, 31, v51
-; GFX12-NEXT:    v_ashrrev_i32_e32 v50, 31, v49
-; GFX12-NEXT:    v_ashrrev_i32_e32 v88, 31, v87
-; GFX12-NEXT:    v_ashrrev_i32_e32 v86, 31, v85
-; GFX12-NEXT:    v_dual_mov_b32 v34, s19 :: v_dual_mov_b32 v17, s4
-; GFX12-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT:    v_ashrrev_i32_e32 v16, 31, v15
-; GFX12-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
-; GFX12-NEXT:    v_ashrrev_i32_e32 v48, 31, v47
-; GFX12-NEXT:    v_ashrrev_i32_e32 v46, 31, v45
-; GFX12-NEXT:    v_ashrrev_i32_e32 v78, 31, v77
-; GFX12-NEXT:    v_ashrrev_i32_e32 v76, 31, v75
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX12-NEXT:    v_ashrrev_i32_e32 v44, 31, v43
-; GFX12-NEXT:    v_ashrrev_i32_e32 v42, 31, v41
-; GFX12-NEXT:    v_ashrrev_i32_e32 v84, 31, v83
-; GFX12-NEXT:    v_ashrrev_i32_e32 v82, 31, v81
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    v_ashrrev_i32_e32 v40, 31, v39
-; GFX12-NEXT:    v_ashrrev_i32_e32 v38, 31, v37
-; GFX12-NEXT:    v_ashrrev_i32_e32 v72, 31, v71
-; GFX12-NEXT:    v_ashrrev_i32_e32 v70, 31, v69
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    v_ashrrev_i32_e32 v36, 31, v35
-; GFX12-NEXT:    v_ashrrev_i32_e32 v60, 31, v59
-; GFX12-NEXT:    v_ashrrev_i32_e32 v58, 31, v57
-; GFX12-NEXT:    v_ashrrev_i32_e32 v64, 31, v63
-; GFX12-NEXT:    v_ashrrev_i32_e32 v62, 31, v61
-; GFX12-NEXT:    v_ashrrev_i32_e32 v68, 31, v67
-; GFX12-NEXT:    v_dual_mov_b32 v66, s41 :: v_dual_mov_b32 v33, s18
-; GFX12-NEXT:    s_clause 0xf
-; GFX12-NEXT:    global_store_b128 v12, v[89:92], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v12, v[85:88], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v12, v[75:78], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v12, v[81:84], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v12, v[69:72], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v12, v[65:68], s[0:1]
-; GFX12-NEXT:    global_store_b128 v12, v[61:64], s[0:1] offset:368
-; GFX12-NEXT:    global_store_b128 v12, v[57:60], s[0:1] offset:352
-; GFX12-NEXT:    global_store_b128 v12, v[53:56], s[0:1] offset:336
-; GFX12-NEXT:    global_store_b128 v12, v[49:52], s[0:1] offset:320
-; GFX12-NEXT:    global_store_b128 v12, v[45:48], s[0:1] offset:304
-; GFX12-NEXT:    global_store_b128 v12, v[41:44], s[0:1] offset:288
-; GFX12-NEXT:    global_store_b128 v12, v[37:40], s[0:1] offset:272
-; GFX12-NEXT:    global_store_b128 v12, v[33:36], s[0:1] offset:256
-; GFX12-NEXT:    global_store_b128 v12, v[29:32], s[0:1] offset:496
-; GFX12-NEXT:    global_store_b128 v12, v[25:28], s[0:1] offset:480
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_dual_mov_b32 v18, s23 :: v_dual_mov_b32 v19, s24
+; GFX12-NEXT:    v_dual_mov_b32 v20, s25 :: v_dual_mov_b32 v21, s28
+; GFX12-NEXT:    s_lshr_b32 s60, s12, 26
+; GFX12-NEXT:    s_lshr_b32 s64, s12, 27
+; GFX12-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s34
+; GFX12-NEXT:    v_mov_b32_e32 v24, s35
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:368
+; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:352
+; GFX12-NEXT:    global_store_b128 v0, v[9:12], s[0:1] offset:336
+; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:320
+; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:304
+; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1] offset:288
+; GFX12-NEXT:    v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v2, s37
+; GFX12-NEXT:    v_dual_mov_b32 v3, s38 :: v_dual_mov_b32 v4, s39
+; GFX12-NEXT:    v_mov_b32_e32 v5, s46
+; GFX12-NEXT:    s_lshr_b32 s68, s12, 24
+; GFX12-NEXT:    s_lshr_b32 s70, s12, 25
+; GFX12-NEXT:    s_lshr_b32 s72, s12, 22
+; GFX12-NEXT:    s_lshr_b32 s76, s12, 23
+; GFX12-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v6, s47 :: v_dual_mov_b32 v7, s42
+; GFX12-NEXT:    v_dual_mov_b32 v8, s43 :: v_dual_mov_b32 v9, s48
+; GFX12-NEXT:    s_lshr_b32 s80, s12, 20
+; GFX12-NEXT:    s_lshr_b32 s82, s12, 21
+; GFX12-NEXT:    s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v10, s49 :: v_dual_mov_b32 v11, s50
+; GFX12-NEXT:    v_dual_mov_b32 v12, s51 :: v_dual_mov_b32 v13, s54
+; GFX12-NEXT:    s_lshr_b32 s84, s12, 18
+; GFX12-NEXT:    s_lshr_b32 s86, s12, 19
+; GFX12-NEXT:    s_bfe_i64 s[76:77], s[76:77], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v14, s55 :: v_dual_mov_b32 v15, s58
+; GFX12-NEXT:    v_dual_mov_b32 v16, s59 :: v_dual_mov_b32 v17, s60
+; GFX12-NEXT:    s_lshr_b32 s90, s12, 16
+; GFX12-NEXT:    s_lshr_b32 s98, s12, 17
+; GFX12-NEXT:    s_bfe_i64 s[82:83], s[82:83], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[80:81], s[80:81], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v18, s61 :: v_dual_mov_b32 v19, s64
+; GFX12-NEXT:    v_dual_mov_b32 v20, s65 :: v_dual_mov_b32 v21, s68
+; GFX12-NEXT:    s_lshr_b32 s96, s12, 14
+; GFX12-NEXT:    s_lshr_b32 s100, s12, 15
+; GFX12-NEXT:    s_lshr_b32 s94, s12, 13
+; GFX12-NEXT:    s_lshr_b32 s88, s12, 11
+; GFX12-NEXT:    s_lshr_b32 s74, s12, 9
+; GFX12-NEXT:    s_lshr_b32 s62, s12, 7
+; GFX12-NEXT:    s_lshr_b32 s52, s12, 5
+; GFX12-NEXT:    s_lshr_b32 s40, s12, 3
+; GFX12-NEXT:    s_lshr_b32 s26, s12, 1
+; GFX12-NEXT:    s_bfe_i64 s[86:87], s[86:87], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[84:85], s[84:85], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v22, s69 :: v_dual_mov_b32 v23, s70
+; GFX12-NEXT:    v_mov_b32_e32 v24, s71
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:272
+; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:256
+; GFX12-NEXT:    global_store_b128 v0, v[9:12], s[0:1] offset:240
+; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:224
+; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:208
+; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s72 :: v_dual_mov_b32 v2, s73
+; GFX12-NEXT:    v_dual_mov_b32 v3, s76 :: v_dual_mov_b32 v4, s77
+; GFX12-NEXT:    v_mov_b32_e32 v5, s80
+; GFX12-NEXT:    s_lshr_b32 s92, s12, 12
+; GFX12-NEXT:    s_lshr_b32 s78, s12, 10
+; GFX12-NEXT:    s_bfe_i64 s[98:99], s[98:99], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[90:91], s[90:91], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v6, s81 :: v_dual_mov_b32 v7, s82
+; GFX12-NEXT:    v_dual_mov_b32 v8, s83 :: v_dual_mov_b32 v9, s84
+; GFX12-NEXT:    s_lshr_b32 s66, s12, 8
+; GFX12-NEXT:    s_lshr_b32 s56, s12, 6
+; GFX12-NEXT:    s_lshr_b32 s44, s12, 4
+; GFX12-NEXT:    s_lshr_b32 s30, s12, 2
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[12:13], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[26:27], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[26:27], s[40:41], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[40:41], s[52:53], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[52:53], s[62:63], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[62:63], s[74:75], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[74:75], s[88:89], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[88:89], s[94:95], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[94:95], s[100:101], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v10, s85 :: v_dual_mov_b32 v11, s86
+; GFX12-NEXT:    v_dual_mov_b32 v12, s87 :: v_dual_mov_b32 v13, s90
+; GFX12-NEXT:    s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX12-NEXT:    s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v14, s91 :: v_dual_mov_b32 v15, s98
+; GFX12-NEXT:    v_dual_mov_b32 v16, s99 :: v_dual_mov_b32 v17, s96
+; GFX12-NEXT:    s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v18, s97 :: v_dual_mov_b32 v19, s94
+; GFX12-NEXT:    v_dual_mov_b32 v20, s95 :: v_dual_mov_b32 v21, s92
+; GFX12-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v22, s93 :: v_dual_mov_b32 v23, s88
+; GFX12-NEXT:    v_mov_b32_e32 v24, s89
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:176
+; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:160
+; GFX12-NEXT:    global_store_b128 v0, v[9:12], s[0:1] offset:144
+; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:128
+; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s78 :: v_dual_mov_b32 v2, s79
+; GFX12-NEXT:    v_dual_mov_b32 v3, s74 :: v_dual_mov_b32 v4, s75
+; GFX12-NEXT:    v_mov_b32_e32 v5, s66
+; GFX12-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v6, s67 :: v_dual_mov_b32 v7, s62
+; GFX12-NEXT:    v_dual_mov_b32 v8, s63 :: v_dual_mov_b32 v9, s56
+; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-NEXT:    v_dual_mov_b32 v10, s57 :: v_dual_mov_b32 v11, s52
+; GFX12-NEXT:    v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v13, s44
+; GFX12-NEXT:    v_dual_mov_b32 v14, s45 :: v_dual_mov_b32 v15, s40
+; GFX12-NEXT:    v_dual_mov_b32 v16, s41 :: v_dual_mov_b32 v17, s30
+; GFX12-NEXT:    v_dual_mov_b32 v18, s31 :: v_dual_mov_b32 v19, s26
+; GFX12-NEXT:    v_dual_mov_b32 v20, s27 :: v_dual_mov_b32 v21, s18
+; GFX12-NEXT:    v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s12
+; GFX12-NEXT:    v_mov_b32_e32 v24, s13
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v12, v[21:24], s[0:1] offset:464
-; GFX12-NEXT:    global_store_b128 v12, v[17:20], s[0:1] offset:448
-; GFX12-NEXT:    global_store_b128 v12, v[13:16], s[0:1] offset:240
-; GFX12-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:224
-; GFX12-NEXT:    global_store_b128 v12, v[4:7], s[0:1] offset:208
-; GFX12-NEXT:    global_store_b128 v12, v[0:3], s[0:1] offset:192
+; GFX12-NEXT:    global_store_b128 v0, v[1:4], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v0, v[5:8], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v0, v[9:12], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v0, v[13:16], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v0, v[17:20], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v0, v[21:24], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 67a376b8c0f3c5..aca455b5925b75 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -869,6 +869,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32:
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -876,7 +877,7 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e32 v3, 8, v2
+; GFX8-NOHSA-NEXT:    v_lshrrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NOHSA-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -916,10 +917,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -976,9 +977,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e32 v3, 8, v2
+; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v2, 8, 8
 ; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v3, 0, 8
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -1012,14 +1012,14 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v2i8_to_v2i32:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT:    s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_sext_i32_i8 s3, s2
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80008
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1076,11 +1076,12 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s0, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -1116,12 +1117,11 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT:    s_and_b32 s3, s2, 0xff
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80008
+; GFX12-NEXT:    s_and_b32 s4, s2, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1179,11 +1179,11 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s1, s2
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -1220,13 +1220,11 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT:    s_sext_i32_i8 s3, s2
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_bfe_i32 s3, s2, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s4, s2
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80008
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
 ; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1287,10 +1285,11 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s2
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, s2, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1326,13 +1325,12 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX12-NEXT:    s_and_b32 s4, s2, 0xff
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX12-NEXT:    s_and_b32 s5, s2, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
+; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
@@ -1392,12 +1390,12 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s2, 24
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s2, 0x80008
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v0, 0, 8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1435,14 +1433,14 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_ashr_i32 s3, s2, 24
-; GFX12-NEXT:    s_sext_i32_i8 s4, s2
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s5, s2
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80008
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1521,26 +1519,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s3, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s8, s2, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, s3, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
@@ -1587,22 +1587,22 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX12-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX12-NEXT:    s_and_b32 s6, s3, 0xff
+; GFX12-NEXT:    s_lshr_b32 s6, s3, 24
+; GFX12-NEXT:    s_bfe_u32 s7, s3, 0x80008
+; GFX12-NEXT:    s_and_b32 s9, s3, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT:    v_mov_b32_e32 v6, s3
+; GFX12-NEXT:    s_lshr_b32 s4, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX12-NEXT:    s_and_b32 s8, s2, 0xff
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s6
+; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s4
+; GFX12-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -1680,27 +1680,27 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s4, s3, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 16
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s7, s2
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s2, s3, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s8, s3, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s9, s3, 0x80008
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1751,24 +1751,22 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT:    s_ashr_i32 s6, s2, 24
-; GFX12-NEXT:    s_sext_i32_i8 s7, s2
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT:    s_ashr_i32 s4, s3, 24
-; GFX12-NEXT:    s_bfe_i32 s5, s3, 0x80010
-; GFX12-NEXT:    s_sext_i32_i8 s3, s3
-; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
-; GFX12-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v4, s3
+; GFX12-NEXT:    s_ashr_i32 s7, s3, 24
+; GFX12-NEXT:    s_bfe_i32 s8, s3, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s9, s3
+; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x80008
+; GFX12-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX12-NEXT:    s_bfe_i32 s5, s2, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s6, s2, 0x80008
+; GFX12-NEXT:    s_sext_i32_i8 s2, s2
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v5, s6
+; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s4
 ; GFX12-NEXT:    v_mov_b32_e32 v6, s5
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 8
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -1894,47 +1892,51 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s4, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s9, s5, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s11, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s13, s6, 0x80008
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s14, s4, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s5
+; GFX8-NOHSA-NEXT:    s_and_b32 s15, s5, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s13, s6, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s6
+; GFX8-NOHSA-NEXT:    s_and_b32 s16, s6, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s3, s7, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s7
+; GFX8-NOHSA-NEXT:    s_and_b32 s17, s7, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -2003,30 +2005,30 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT:    s_lshr_b32 s8, s6, 24
-; GFX12-NEXT:    s_lshr_b32 s9, s7, 24
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT:    s_and_b32 s12, s6, 0xff
-; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT:    s_and_b32 s13, s7, 0xff
+; GFX12-NEXT:    s_lshr_b32 s12, s7, 24
+; GFX12-NEXT:    s_bfe_u32 s13, s7, 0x80008
+; GFX12-NEXT:    s_and_b32 s17, s7, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT:    s_and_b32 s11, s5, 0xff
-; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9
-; GFX12-NEXT:    s_lshr_b32 s3, s5, 24
+; GFX12-NEXT:    s_lshr_b32 s10, s6, 24
+; GFX12-NEXT:    s_bfe_u32 s11, s6, 0x80008
+; GFX12-NEXT:    s_and_b32 s16, s6, 0xff
+; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
+; GFX12-NEXT:    s_lshr_b32 s8, s5, 24
+; GFX12-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX12-NEXT:    s_and_b32 s15, s5, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
-; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
-; GFX12-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT:    v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s12
+; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s11
 ; GFX12-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX12-NEXT:    s_and_b32 s10, s4, 0xff
+; GFX12-NEXT:    s_bfe_u32 s3, s4, 0x80008
+; GFX12-NEXT:    s_and_b32 s14, s4, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT:    v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT:    v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v9, s9
+; GFX12-NEXT:    v_dual_mov_b32 v8, s15 :: v_dual_mov_b32 v11, s8
+; GFX12-NEXT:    v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v14, s4
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
@@ -2159,50 +2161,50 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s8, s4, 24
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s9, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s10, s5, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s11, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s12, s6, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s13, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s10, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s11, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s12, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s13, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s14, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s15, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s16, s6, 0x80008
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s2, s7, 24
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s2
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s17, s7, 0x80008
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v7, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v5, v2, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s5
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s4
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -2279,35 +2281,31 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT:    s_ashr_i32 s12, s7, 24
-; GFX12-NEXT:    s_sext_i32_i8 s13, s7
-; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80010
-; GFX12-NEXT:    s_ashr_i32 s10, s6, 24
-; GFX12-NEXT:    s_bfe_i32 s11, s6, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s15, s7, 24
+; GFX12-NEXT:    s_bfe_i32 s16, s7, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s17, s7
+; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80008
+; GFX12-NEXT:    s_ashr_i32 s12, s6, 24
+; GFX12-NEXT:    s_bfe_i32 s13, s6, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s14, s6, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s6, s6
-; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12
-; GFX12-NEXT:    s_ashr_i32 s8, s5, 24
-; GFX12-NEXT:    s_bfe_i32 s9, s5, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    s_ashr_i32 s9, s5, 24
+; GFX12-NEXT:    s_bfe_i32 s10, s5, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s11, s5, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s5, s5
-; GFX12-NEXT:    v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
-; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT:    v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s15
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v5, s14
 ; GFX12-NEXT:    s_ashr_i32 s2, s4, 24
 ; GFX12-NEXT:    s_bfe_i32 s3, s4, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s8, s4, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s4, s4
-; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT:    v_mov_b32_e32 v6, s11
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v8, s5
-; GFX12-NEXT:    v_mov_b32_e32 v10, s9
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v12, s4
+; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s12
+; GFX12-NEXT:    v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s11
+; GFX12-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s8
+; GFX12-NEXT:    v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
 ; GFX12-NEXT:    v_mov_b32_e32 v14, s3
-; GFX12-NEXT:    v_bfe_i32 v13, v13, 0, 8
 ; GFX12-NEXT:    s_clause 0x3
 ; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
 ; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
@@ -2525,103 +2523,111 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ;
 ; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX8-NOHSA:       ; %bb.0:
-; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s0, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s11, s1, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s3, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s4, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s15, s5, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s13, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s15, s5, 0x80008
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s17, s7, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s18, s0, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s0
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s19, s0, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s20, s1, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s21, s1, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s22, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s2
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s23, s3, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s3
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s24, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s4
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s17, s6, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s7, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s19, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s21, s8, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s9, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s23, s9, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s24, s10, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s25, s10, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s11, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s11, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s26, s4, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s25, s5, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s26, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s27, s6, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s28, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, s7, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x70
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s1
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x60
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s17
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x50
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s27
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s28
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s16
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 64
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s25
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s26
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s15
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s14
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s12
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s21
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT:    s_and_b32 s27, s5, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s28, s6, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s29, s7, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s30, s8, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s31, s9, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s33, s10, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s34, s11, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s33
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 64
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s22
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s16
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -2728,67 +2734,66 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT:    s_lshr_b32 s15, s9, 24
-; GFX12-NEXT:    s_lshr_b32 s17, s11, 24
-; GFX12-NEXT:    v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT:    s_and_b32 s23, s9, 0xff
-; GFX12-NEXT:    s_bfe_u32 s9, s9, 0x80010
-; GFX12-NEXT:    s_and_b32 s25, s11, 0xff
+; GFX12-NEXT:    s_lshr_b32 s24, s11, 24
+; GFX12-NEXT:    s_bfe_u32 s25, s11, 0x80008
+; GFX12-NEXT:    s_and_b32 s34, s11, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s11, s11, 0x80010
-; GFX12-NEXT:    s_lshr_b32 s14, s8, 24
-; GFX12-NEXT:    s_lshr_b32 s16, s10, 24
-; GFX12-NEXT:    v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT:    s_and_b32 s22, s8, 0xff
-; GFX12-NEXT:    s_bfe_u32 s8, s8, 0x80010
-; GFX12-NEXT:    s_and_b32 s24, s10, 0xff
+; GFX12-NEXT:    s_lshr_b32 s22, s10, 24
+; GFX12-NEXT:    s_bfe_u32 s23, s10, 0x80008
+; GFX12-NEXT:    s_and_b32 s33, s10, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s10, s10, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17
-; GFX12-NEXT:    s_lshr_b32 s13, s7, 24
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT:    s_and_b32 s21, s7, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s25
+; GFX12-NEXT:    v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s24
+; GFX12-NEXT:    v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s23
+; GFX12-NEXT:    s_bfe_u32 s21, s9, 0x80008
+; GFX12-NEXT:    v_dual_mov_b32 v4, s33 :: v_dual_mov_b32 v7, s22
+; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s21
+; GFX12-NEXT:    s_lshr_b32 s20, s9, 24
+; GFX12-NEXT:    s_and_b32 s31, s9, 0xff
+; GFX12-NEXT:    s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT:    s_lshr_b32 s18, s8, 24
+; GFX12-NEXT:    s_bfe_u32 s19, s8, 0x80008
+; GFX12-NEXT:    s_and_b32 s30, s8, 0xff
+; GFX12-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT:    s_lshr_b32 s16, s7, 24
+; GFX12-NEXT:    s_bfe_u32 s17, s7, 0x80008
+; GFX12-NEXT:    s_and_b32 s29, s7, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
-; GFX12-NEXT:    v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT:    v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11
-; GFX12-NEXT:    v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10
-; GFX12-NEXT:    v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT:    v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15
-; GFX12-NEXT:    v_mov_b32_e32 v26, s7
-; GFX12-NEXT:    s_lshr_b32 s12, s6, 24
-; GFX12-NEXT:    s_and_b32 s20, s6, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v8, s31 :: v_dual_mov_b32 v11, s20
+; GFX12-NEXT:    v_mov_b32_e32 v10, s9
+; GFX12-NEXT:    s_lshr_b32 s14, s6, 24
+; GFX12-NEXT:    s_bfe_u32 s15, s6, 0x80008
+; GFX12-NEXT:    s_and_b32 s28, s6, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
-; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
-; GFX12-NEXT:    v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
-; GFX12-NEXT:    s_lshr_b32 s3, s5, 24
-; GFX12-NEXT:    s_and_b32 s19, s5, 0xff
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v0, s30
+; GFX12-NEXT:    v_dual_mov_b32 v3, s18 :: v_dual_mov_b32 v2, s8
+; GFX12-NEXT:    v_mov_b32_e32 v5, s17
+; GFX12-NEXT:    s_lshr_b32 s12, s5, 24
+; GFX12-NEXT:    s_bfe_u32 s13, s5, 0x80008
+; GFX12-NEXT:    s_and_b32 s27, s5, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6
+; GFX12-NEXT:    v_dual_mov_b32 v4, s29 :: v_dual_mov_b32 v7, s16
+; GFX12-NEXT:    v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v13, s15
 ; GFX12-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX12-NEXT:    s_and_b32 s18, s4, 0xff
+; GFX12-NEXT:    s_bfe_u32 s3, s4, 0x80008
+; GFX12-NEXT:    s_and_b32 s26, s4, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
-; GFX12-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT:    v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT:    v_mov_b32_e32 v14, s4
+; GFX12-NEXT:    v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s14
+; GFX12-NEXT:    v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v17, s13
+; GFX12-NEXT:    v_dual_mov_b32 v16, s27 :: v_dual_mov_b32 v19, s12
+; GFX12-NEXT:    v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v21, s3
+; GFX12-NEXT:    v_dual_mov_b32 v20, s26 :: v_dual_mov_b32 v23, s2
+; GFX12-NEXT:    v_mov_b32_e32 v22, s4
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -3001,111 +3006,111 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ;
 ; GFX8-NOHSA-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX8-NOHSA:       ; %bb.0:
-; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x24
+; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s0
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s10, s0, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s11, s0, 0x80010
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s12, s0
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s13, s1, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s14, s1, 0x80010
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s15, s1
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s16, s2, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s17, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s18, s3, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s19, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s20, s4, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s21, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s22, s5, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s23, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s24, s6, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s25, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s7, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x70
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s1
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s7
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s12, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s13, s4, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s14, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s15, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s16, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s17, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s18, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s19, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s20, s6, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s21, s7, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s22, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s23, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s24, s8, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s25, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s26, s8, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s27, s9, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s28, s9, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s29, s9, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s30, s10, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s31, s10, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s33, s10, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s2, s11, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s11, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s34, s11, 0x80008
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x70
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s11, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s34
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s10, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s33
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 64
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s7, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x60
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v11, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s7
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 0x50
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v11, v2, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s25
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s24
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 64
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v11, v4, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s22
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 48
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v9, v6, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s21
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s20
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s19
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s18
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s17
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s16
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -3235,74 +3240,66 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT:    v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT:    s_ashr_i32 s20, s9, 24
-; GFX12-NEXT:    s_bfe_i32 s21, s9, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s31, s11, 24
+; GFX12-NEXT:    s_bfe_i32 s33, s11, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s34, s11
+; GFX12-NEXT:    s_bfe_i32 s11, s11, 0x80008
+; GFX12-NEXT:    s_ashr_i32 s28, s10, 24
+; GFX12-NEXT:    s_bfe_i32 s29, s10, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s30, s10, 0x80008
+; GFX12-NEXT:    s_sext_i32_i8 s10, s10
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s11
+; GFX12-NEXT:    v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s31
+; GFX12-NEXT:    v_dual_mov_b32 v2, s33 :: v_dual_mov_b32 v5, s30
+; GFX12-NEXT:    s_bfe_i32 s27, s9, 0x80008
+; GFX12-NEXT:    v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s28
+; GFX12-NEXT:    v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v9, s27
+; GFX12-NEXT:    s_ashr_i32 s25, s9, 24
+; GFX12-NEXT:    s_bfe_i32 s26, s9, 0x80010
 ; GFX12-NEXT:    s_sext_i32_i8 s9, s9
-; GFX12-NEXT:    s_ashr_i32 s24, s11, 24
-; GFX12-NEXT:    s_sext_i32_i8 s25, s11
-; GFX12-NEXT:    s_bfe_i32 s11, s11, 0x80010
-; GFX12-NEXT:    v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT:    s_ashr_i32 s18, s8, 24
-; GFX12-NEXT:    s_bfe_i32 s19, s8, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s22, s8, 24
+; GFX12-NEXT:    s_bfe_i32 s23, s8, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s24, s8, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s8, s8
-; GFX12-NEXT:    s_ashr_i32 s22, s10, 24
-; GFX12-NEXT:    s_bfe_i32 s23, s10, 0x80010
-; GFX12-NEXT:    s_sext_i32_i8 s10, s10
-; GFX12-NEXT:    v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT:    s_ashr_i32 s12, s5, 24
-; GFX12-NEXT:    s_ashr_i32 s14, s6, 24
-; GFX12-NEXT:    s_ashr_i32 s16, s7, 24
-; GFX12-NEXT:    s_bfe_i32 s17, s7, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s19, s7, 24
+; GFX12-NEXT:    s_bfe_i32 s20, s7, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s21, s7, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s7, s7
-; GFX12-NEXT:    v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
-; GFX12-NEXT:    v_mov_b32_e32 v2, s11
-; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v25, v11, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v29, v10, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12
-; GFX12-NEXT:    v_mov_b32_e32 v11, s20
-; GFX12-NEXT:    s_ashr_i32 s2, s4, 24
-; GFX12-NEXT:    s_bfe_i32 s15, s6, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s25
+; GFX12-NEXT:    v_mov_b32_e32 v10, s26
+; GFX12-NEXT:    s_ashr_i32 s16, s6, 24
+; GFX12-NEXT:    s_bfe_i32 s17, s6, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s18, s6, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s6, s6
-; GFX12-NEXT:    v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
-; GFX12-NEXT:    v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT:    v_mov_b32_e32 v30, s19
-; GFX12-NEXT:    s_bfe_i32 s13, s5, 0x80010
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s24 :: v_dual_mov_b32 v0, s8
+; GFX12-NEXT:    v_dual_mov_b32 v3, s22 :: v_dual_mov_b32 v2, s23
+; GFX12-NEXT:    v_mov_b32_e32 v5, s21
+; GFX12-NEXT:    s_ashr_i32 s13, s5, 24
+; GFX12-NEXT:    s_bfe_i32 s14, s5, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s15, s5, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s5, s5
-; GFX12-NEXT:    v_mov_b32_e32 v24, s7
-; GFX12-NEXT:    v_mov_b32_e32 v26, s17
+; GFX12-NEXT:    v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v7, s19
+; GFX12-NEXT:    v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v13, s18
+; GFX12-NEXT:    s_ashr_i32 s2, s4, 24
 ; GFX12-NEXT:    s_bfe_i32 s3, s4, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s12, s4, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s4, s4
-; GFX12-NEXT:    v_bfe_i32 v21, v12, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v20, s6
-; GFX12-NEXT:    v_mov_b32_e32 v22, s15
-; GFX12-NEXT:    v_bfe_i32 v17, v14, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v16, s5
-; GFX12-NEXT:    v_mov_b32_e32 v18, s13
-; GFX12-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT:    v_mov_b32_e32 v12, s4
-; GFX12-NEXT:    v_mov_b32_e32 v14, s3
+; GFX12-NEXT:    v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s16
+; GFX12-NEXT:    v_dual_mov_b32 v14, s17 :: v_dual_mov_b32 v17, s15
+; GFX12-NEXT:    v_dual_mov_b32 v16, s5 :: v_dual_mov_b32 v19, s13
+; GFX12-NEXT:    v_dual_mov_b32 v18, s14 :: v_dual_mov_b32 v21, s12
+; GFX12-NEXT:    v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s2
+; GFX12-NEXT:    v_mov_b32_e32 v22, s3
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -3704,196 +3701,209 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s0, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s19, s1, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s21, s2, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s23, s3, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s25, s4, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s27, s5, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s29, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s33, s7, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s34, s8, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s35, s9, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s36, s10, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s37, s11, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s38, s12, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s39, s13, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s40, s14, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s30, s15, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s20, s0, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s19, s0, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s1, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s21, s1, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s23, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s25, s3, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s26, s3, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s28, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s29, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s31, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s33, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s35, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s36, s6, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s37, s7, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s38, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s39, s8, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s40, s8, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s41, s9, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s42, s9, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s43, s10, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s44, s10, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s45, s11, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s46, s11, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s47, s12, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s48, s12, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s49, s13, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s50, s13, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s51, s14, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s52, s14, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s53, s15, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s54, s15, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s24, s0, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s22, s1, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT:    s_and_b32 s27, s1, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s24, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s2
+; GFX8-NOHSA-NEXT:    s_and_b32 s30, s2, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s26, s3, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s3
+; GFX8-NOHSA-NEXT:    s_and_b32 s34, s3, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s28, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s4
+; GFX8-NOHSA-NEXT:    s_and_b32 s55, s4, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s41, s5, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s42, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s43, s6, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s44, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s45, s7, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s46, s7, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s47, s8, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s48, s8, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s49, s9, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s50, s9, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s51, s10, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s52, s10, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s53, s11, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s54, s11, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s55, s12, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s56, s12, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s57, s13, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s58, s13, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s59, s14, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s60, s14, 0x80010
-; GFX8-NOHSA-NEXT:    s_and_b32 s31, s15, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s15
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s15, s15, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s30
-; GFX8-NOHSA-NEXT:    s_add_u32 s30, s16, 0xf0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s31
-; GFX8-NOHSA-NEXT:    s_addc_u32 s31, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s30
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s15
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s31
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s28
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s14
-; GFX8-NOHSA-NEXT:    s_add_u32 s14, s16, 0xe0
-; GFX8-NOHSA-NEXT:    s_addc_u32 s15, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s59
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s60
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s40
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s15
-; GFX8-NOHSA-NEXT:    s_add_u32 s14, s16, 0xd0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s15, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s14
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s57
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s58
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s39
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s15
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s26
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s12
-; GFX8-NOHSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s12
-; GFX8-NOHSA-NEXT:    s_add_u32 s12, s16, 0xb0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s55
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s56
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s38
-; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s13
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s53
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s54
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s37
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s12
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s24
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s10
-; GFX8-NOHSA-NEXT:    s_add_u32 s10, s16, 0xa0
-; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s10
-; GFX8-NOHSA-NEXT:    s_add_u32 s10, s16, 0x90
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s51
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s52
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s36
-; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s11
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s49
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s50
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s35
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s10
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s8
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s16, 0x80
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s8
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s16, 0x70
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s47
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s34
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s9
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s45
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s46
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s33
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s8
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s6
+; GFX8-NOHSA-NEXT:    s_and_b32 s56, s5, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s57, s6, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s58, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s59, s7, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s60, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s61, s8, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s62, s9, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s63, s10, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s64, s11, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s65, s12, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s12, s12, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s66, s13, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s13, s13, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s67, s14, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s14, s14, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s6, s15, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s15, 0x80010
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xf0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s54
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s53
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xe0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s67
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s52
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s51
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xd0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s66
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s49
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xc0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s65
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s47
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xb0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s64
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s45
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xa0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s63
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s44
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s43
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x90
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s62
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s42
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x80
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s61
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s40
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s39
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x70
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s59
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s38
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s37
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x60
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s57
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x50
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s43
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s44
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s29
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s7
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s41
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s42
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s27
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s6
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s56
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s33
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 64
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s55
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s25
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 32
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s22
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s19
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s27
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s17
@@ -4090,126 +4100,124 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s15
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s1
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s14
-; GFX12-NEXT:    v_lshrrev_b16 v15, 8, s4
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v3, 8, s13
-; GFX12-NEXT:    v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT:    s_lshr_b32 s34, s15, 24
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s12
-; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT:    s_and_b32 s50, s15, 0xff
+; GFX12-NEXT:    s_lshr_b32 s49, s15, 24
+; GFX12-NEXT:    s_bfe_u32 s50, s15, 0x80008
+; GFX12-NEXT:    s_and_b32 s66, s15, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s15, s15, 0x80010
-; GFX12-NEXT:    s_lshr_b32 s33, s14, 24
-; GFX12-NEXT:    s_and_b32 s49, s14, 0xff
+; GFX12-NEXT:    s_lshr_b32 s47, s14, 24
+; GFX12-NEXT:    s_bfe_u32 s48, s14, 0x80008
+; GFX12-NEXT:    s_and_b32 s65, s14, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s14, s14, 0x80010
-; GFX12-NEXT:    s_lshr_b32 s26, s8, 24
-; GFX12-NEXT:    s_lshr_b32 s31, s13, 24
-; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s11
-; GFX12-NEXT:    v_lshrrev_b16 v7, 8, s10
-; GFX12-NEXT:    v_lshrrev_b16 v8, 8, s9
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s0
-; GFX12-NEXT:    v_dual_mov_b32 v60, 0 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT:    v_dual_mov_b32 v56, s50 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT:    v_mov_b32_e32 v58, s15
-; GFX12-NEXT:    s_and_b32 s43, s8, 0xff
-; GFX12-NEXT:    s_bfe_u32 s8, s8, 0x80010
-; GFX12-NEXT:    s_and_b32 s48, s13, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s50
+; GFX12-NEXT:    s_lshr_b32 s45, s13, 24
+; GFX12-NEXT:    s_bfe_u32 s46, s13, 0x80008
+; GFX12-NEXT:    s_and_b32 s64, s13, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s13, s13, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v30, s43 :: v_dual_and_b32 v57, 0xffff, v0
-; GFX12-NEXT:    v_dual_mov_b32 v59, s34 :: v_dual_mov_b32 v32, s8
-; GFX12-NEXT:    s_lshr_b32 s27, s9, 24
-; GFX12-NEXT:    s_lshr_b32 s30, s12, 24
-; GFX12-NEXT:    v_dual_mov_b32 v52, s49 :: v_dual_and_b32 v13, 0xffff, v13
-; GFX12-NEXT:    v_dual_mov_b32 v54, s14 :: v_dual_and_b32 v17, 0xffff, v15
-; GFX12-NEXT:    s_and_b32 s42, s7, 0xff
-; GFX12-NEXT:    s_and_b32 s44, s9, 0xff
-; GFX12-NEXT:    s_bfe_u32 s9, s9, 0x80010
-; GFX12-NEXT:    s_and_b32 s47, s12, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v0, s66 :: v_dual_mov_b32 v3, s49
+; GFX12-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s48
+; GFX12-NEXT:    s_lshr_b32 s43, s12, 24
+; GFX12-NEXT:    s_bfe_u32 s44, s12, 0x80008
+; GFX12-NEXT:    s_and_b32 s63, s12, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s12, s12, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v36, s9 :: v_dual_and_b32 v53, 0xffff, v2
-; GFX12-NEXT:    v_dual_mov_b32 v55, s33 :: v_dual_mov_b32 v26, s42
-; GFX12-NEXT:    s_lshr_b32 s25, s7, 24
-; GFX12-NEXT:    v_dual_mov_b32 v48, s48 :: v_dual_and_b32 v21, 0xffff, v14
-; GFX12-NEXT:    v_dual_mov_b32 v50, s13 :: v_dual_and_b32 v23, 0xffff, v12
-; GFX12-NEXT:    s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v34, s44 :: v_dual_and_b32 v49, 0xffff, v3
-; GFX12-NEXT:    v_dual_mov_b32 v51, s31 :: v_dual_mov_b32 v28, s7
-; GFX12-NEXT:    s_lshr_b32 s28, s10, 24
-; GFX12-NEXT:    s_lshr_b32 s29, s11, 24
-; GFX12-NEXT:    s_and_b32 s41, s6, 0xff
-; GFX12-NEXT:    v_dual_mov_b32 v44, s47 :: v_dual_and_b32 v27, 0xffff, v11
-; GFX12-NEXT:    v_dual_mov_b32 v46, s12 :: v_dual_and_b32 v31, 0xffff, v10
-; GFX12-NEXT:    s_and_b32 s45, s10, 0xff
-; GFX12-NEXT:    s_bfe_u32 s10, s10, 0x80010
-; GFX12-NEXT:    s_and_b32 s46, s11, 0xff
-; GFX12-NEXT:    v_dual_mov_b32 v40, s10 :: v_dual_and_b32 v45, 0xffff, v4
-; GFX12-NEXT:    v_dual_mov_b32 v47, s30 :: v_dual_mov_b32 v22, s41
+; GFX12-NEXT:    v_dual_mov_b32 v4, s65 :: v_dual_mov_b32 v7, s47
+; GFX12-NEXT:    v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v9, s46
+; GFX12-NEXT:    v_dual_mov_b32 v8, s64 :: v_dual_mov_b32 v11, s45
+; GFX12-NEXT:    v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v13, s44
+; GFX12-NEXT:    s_lshr_b32 s41, s11, 24
+; GFX12-NEXT:    s_bfe_u32 s42, s11, 0x80008
+; GFX12-NEXT:    s_and_b32 s62, s11, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v12, s63 :: v_dual_mov_b32 v15, s43
+; GFX12-NEXT:    v_mov_b32_e32 v14, s12
 ; GFX12-NEXT:    s_bfe_u32 s11, s11, 0x80010
-; GFX12-NEXT:    s_lshr_b32 s24, s6, 24
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v42, s46 :: v_dual_and_b32 v35, 0xffff, v8
-; GFX12-NEXT:    v_and_b32_e32 v39, 0xffff, v7
-; GFX12-NEXT:    v_dual_mov_b32 v38, s45 :: v_dual_and_b32 v43, 0xffff, v6
+; GFX12-NEXT:    s_lshr_b32 s39, s10, 24
+; GFX12-NEXT:    s_bfe_u32 s40, s10, 0x80008
+; GFX12-NEXT:    s_and_b32 s61, s10, 0xff
+; GFX12-NEXT:    s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT:    s_lshr_b32 s37, s9, 24
+; GFX12-NEXT:    s_bfe_u32 s38, s9, 0x80008
+; GFX12-NEXT:    s_and_b32 s60, s9, 0xff
+; GFX12-NEXT:    s_bfe_u32 s9, s9, 0x80010
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v60, v[56:59], s[16:17] offset:240
-; GFX12-NEXT:    global_store_b128 v60, v[52:55], s[16:17] offset:224
-; GFX12-NEXT:    global_store_b128 v60, v[48:51], s[16:17] offset:208
-; GFX12-NEXT:    global_store_b128 v60, v[44:47], s[16:17] offset:192
-; GFX12-NEXT:    v_dual_mov_b32 v44, s11 :: v_dual_mov_b32 v45, s29
-; GFX12-NEXT:    v_mov_b32_e32 v24, s6
-; GFX12-NEXT:    s_and_b32 s40, s5, 0xff
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v41, s28 :: v_dual_mov_b32 v20, s40
-; GFX12-NEXT:    s_lshr_b32 s23, s5, 24
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s42 :: v_dual_mov_b32 v0, s62
+; GFX12-NEXT:    v_dual_mov_b32 v3, s41 :: v_dual_mov_b32 v2, s11
+; GFX12-NEXT:    v_mov_b32_e32 v5, s40
+; GFX12-NEXT:    s_lshr_b32 s35, s8, 24
+; GFX12-NEXT:    s_bfe_u32 s36, s8, 0x80008
+; GFX12-NEXT:    s_and_b32 s59, s8, 0xff
+; GFX12-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s61 :: v_dual_mov_b32 v7, s39
+; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v9, s38
+; GFX12-NEXT:    s_lshr_b32 s33, s7, 24
+; GFX12-NEXT:    s_bfe_u32 s34, s7, 0x80008
+; GFX12-NEXT:    s_and_b32 s58, s7, 0xff
+; GFX12-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v8, s60 :: v_dual_mov_b32 v11, s37
+; GFX12-NEXT:    v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v13, s36
+; GFX12-NEXT:    s_lshr_b32 s28, s5, 24
+; GFX12-NEXT:    s_bfe_u32 s29, s5, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s30, s6, 24
+; GFX12-NEXT:    s_bfe_u32 s31, s6, 0x80008
+; GFX12-NEXT:    s_and_b32 s56, s5, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT:    v_mov_b32_e32 v37, s27
-; GFX12-NEXT:    s_lshr_b32 s22, s4, 24
-; GFX12-NEXT:    s_and_b32 s38, s3, 0xff
-; GFX12-NEXT:    s_and_b32 s39, s4, 0xff
+; GFX12-NEXT:    s_and_b32 s57, s6, 0xff
+; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v12, s59 :: v_dual_mov_b32 v15, s35
+; GFX12-NEXT:    v_dual_mov_b32 v14, s8 :: v_dual_mov_b32 v17, s34
+; GFX12-NEXT:    s_lshr_b32 s26, s4, 24
+; GFX12-NEXT:    s_bfe_u32 s27, s4, 0x80008
+; GFX12-NEXT:    s_and_b32 s55, s4, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v33, s26 :: v_dual_mov_b32 v16, s39
-; GFX12-NEXT:    v_dual_mov_b32 v29, s25 :: v_dual_mov_b32 v18, s4
-; GFX12-NEXT:    s_lshr_b32 s21, s3, 24
+; GFX12-NEXT:    v_dual_mov_b32 v16, s58 :: v_dual_mov_b32 v19, s33
+; GFX12-NEXT:    v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v21, s31
+; GFX12-NEXT:    s_lshr_b32 s24, s3, 24
+; GFX12-NEXT:    s_bfe_u32 s25, s3, 0x80008
+; GFX12-NEXT:    s_and_b32 s54, s3, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s3, s3, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v25, s24 :: v_dual_mov_b32 v12, s38
+; GFX12-NEXT:    v_dual_mov_b32 v20, s57 :: v_dual_mov_b32 v23, s30
+; GFX12-NEXT:    v_mov_b32_e32 v22, s6
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v60, v[42:45], s[16:17] offset:176
-; GFX12-NEXT:    global_store_b128 v60, v[38:41], s[16:17] offset:160
-; GFX12-NEXT:    global_store_b128 v60, v[34:37], s[16:17] offset:144
-; GFX12-NEXT:    global_store_b128 v60, v[30:33], s[16:17] offset:128
-; GFX12-NEXT:    global_store_b128 v60, v[26:29], s[16:17] offset:112
-; GFX12-NEXT:    global_store_b128 v60, v[22:25], s[16:17] offset:96
-; GFX12-NEXT:    v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s23
-; GFX12-NEXT:    v_mov_b32_e32 v14, s3
-; GFX12-NEXT:    s_lshr_b32 s20, s2, 24
-; GFX12-NEXT:    s_and_b32 s37, s2, 0xff
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v0, s56
+; GFX12-NEXT:    v_dual_mov_b32 v3, s28 :: v_dual_mov_b32 v2, s5
+; GFX12-NEXT:    v_mov_b32_e32 v5, s27
+; GFX12-NEXT:    s_lshr_b32 s22, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s23, s2, 0x80008
+; GFX12-NEXT:    s_and_b32 s53, s2, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v19, s22 :: v_dual_mov_b32 v8, s37
-; GFX12-NEXT:    s_lshr_b32 s19, s1, 24
-; GFX12-NEXT:    s_and_b32 s36, s1, 0xff
+; GFX12-NEXT:    v_dual_mov_b32 v4, s55 :: v_dual_mov_b32 v7, s26
+; GFX12-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT:    s_lshr_b32 s20, s1, 24
+; GFX12-NEXT:    s_bfe_u32 s21, s1, 0x80008
+; GFX12-NEXT:    s_and_b32 s52, s1, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s1, s1, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v15, s21 :: v_dual_mov_b32 v10, s2
+; GFX12-NEXT:    v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s24
+; GFX12-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s23
 ; GFX12-NEXT:    s_lshr_b32 s18, s0, 24
-; GFX12-NEXT:    s_and_b32 s35, s0, 0xff
+; GFX12-NEXT:    s_bfe_u32 s19, s0, 0x80008
+; GFX12-NEXT:    s_and_b32 s51, s0, 0xff
 ; GFX12-NEXT:    s_bfe_u32 s0, s0, 0x80010
-; GFX12-NEXT:    v_dual_mov_b32 v11, s20 :: v_dual_mov_b32 v4, s36
-; GFX12-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s19
-; GFX12-NEXT:    v_dual_mov_b32 v0, s35 :: v_dual_mov_b32 v3, s18
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    v_dual_mov_b32 v12, s53 :: v_dual_mov_b32 v15, s22
+; GFX12-NEXT:    v_dual_mov_b32 v14, s2 :: v_dual_mov_b32 v17, s21
+; GFX12-NEXT:    v_dual_mov_b32 v16, s52 :: v_dual_mov_b32 v19, s20
+; GFX12-NEXT:    v_dual_mov_b32 v18, s1 :: v_dual_mov_b32 v21, s19
+; GFX12-NEXT:    v_dual_mov_b32 v20, s51 :: v_dual_mov_b32 v23, s18
+; GFX12-NEXT:    v_mov_b32_e32 v22, s0
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v60, v[20:23], s[16:17] offset:80
-; GFX12-NEXT:    global_store_b128 v60, v[16:19], s[16:17] offset:64
-; GFX12-NEXT:    global_store_b128 v60, v[12:15], s[16:17] offset:48
-; GFX12-NEXT:    global_store_b128 v60, v[8:11], s[16:17] offset:32
-; GFX12-NEXT:    global_store_b128 v60, v[4:7], s[16:17] offset:16
-; GFX12-NEXT:    global_store_b128 v60, v[0:3], s[16:17]
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -4608,217 +4616,217 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v18, 8, s14
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s18, s0, 24
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s19, s0, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s20, s1, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s21, s1, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s22, s2, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s23, s2, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s24, s3, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s25, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s26, s4, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s27, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s28, s5, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s29, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s30, s6, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s31, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s33, s7, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s34, s7, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s35, s8, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s36, s8, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s37, s9, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s38, s9, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s39, s10, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s40, s10, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s41, s11, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s42, s11, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s43, s12, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s44, s12, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s45, s13, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s46, s13, 0x80010
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s47, s14, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s48, s14, 0x80010
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s49, s14
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s14, s15, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s50, s15, 0x80010
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s15
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s20, s0, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s21, s1, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s22, s1, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s23, s1, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s24, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s25, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s26, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s27, s3, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s28, s3, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s29, s3, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s30, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s31, s4, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s33, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s34, s5, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s35, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s36, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s37, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s38, s6, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s39, s6, 0x80008
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s40, s6
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s41, s7, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s42, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s43, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s44, s7
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s45, s8, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s46, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s47, s8, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s48, s9, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s49, s9, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s50, s9, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s51, s10, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s52, s10, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s53, s10, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s54, s11, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s55, s11, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s56, s11, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s57, s12, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s58, s12, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s59, s12, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s60, s13, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s61, s13, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s62, s13, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s63, s14, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s64, s14, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s65, s14, 0x80008
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s6, s15, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s7, s15, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s66, s15, 0x80008
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xf0
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s15, s15
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s14
-; GFX8-NOHSA-NEXT:    s_add_u32 s14, s16, 0xf0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s15
-; GFX8-NOHSA-NEXT:    s_addc_u32 s15, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s15
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s14
-; GFX8-NOHSA-NEXT:    s_add_u32 s14, s16, 0xe0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v13, v5, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s50
-; GFX8-NOHSA-NEXT:    s_addc_u32 s15, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s15
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v20, 8, s12
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v13, v18, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s49
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s47
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s14
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s14, s12
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s12, s13
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v19, 8, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s12
-; GFX8-NOHSA-NEXT:    s_add_u32 s12, s16, 0xd0
-; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s12
-; GFX8-NOHSA-NEXT:    s_add_u32 s12, s16, 0xc0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v13, v19, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s46
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s45
-; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s12
-; GFX8-NOHSA-NEXT:    s_add_u32 s12, s16, 0xb0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v13, v20, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s44
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s43
-; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s17, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v11, 8, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s66
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xe0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s14, s14
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s65
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s64
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s63
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xd0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s13, s13
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s62
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s61
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xc0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s12, s12
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s59
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s58
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s57
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xb0
 ; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s11, s11
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s13
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s8
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v11, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s42
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s41
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s12
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s11, s4
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s8
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s8, s10
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v8, 8, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s8
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s16, 0xa0
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s12, s9
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s9
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v9, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s40
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s39
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s8
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s16, 0x90
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v8, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s38
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s37
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s9
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s16, 0x80
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s17, 0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v7, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s36
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s35
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s9
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s7
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[7:8], v[11:14]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 0x70
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s8, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 0x60
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v4, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s34
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s33
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 0x50
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v2, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s31
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s30
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 64
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v12, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s29
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v14, s28
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[11:14]
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v21, 8, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 48
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v7, v6, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s27
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s26
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v10, 8, s3
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[11:12], v[6:9]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s5
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 32
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s25
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s4
-; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[9:12]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v5, v21, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s22
-; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s1, s1
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s21
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s20
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s0
-; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s0, s0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s17
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT:    s_endpgm
-;
-; EG-LABEL: constant_sextload_v64i8_to_v64i32:
-; EG:       ; %bb.0:
-; EG-NEXT:    ALU 0, @32, KC0[CB0:0-32], KC1[]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s56
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s55
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s54
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0xa0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s10, s10
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s53
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s52
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s51
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x90
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s9, s9
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s49
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x80
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s8, s8
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s47
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s45
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x70
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s44
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s43
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s42
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x60
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s40
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s39
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s37
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s16, 0x50
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s36
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s35
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 64
+; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s33
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s30
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    s_add_u32 s4, s16, 48
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 32
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s16, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s24
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s17, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_sext_i32_i8 s0, s0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s17
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_endpgm
+;
+; EG-LABEL: constant_sextload_v64i8_to_v64i32:
+; EG:       ; %bb.0:
+; EG-NEXT:    ALU 0, @32, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @24
 ; EG-NEXT:    ALU 40, @33, KC0[CB0:0-32], KC1[]
 ; EG-NEXT:    TEX 1 @28
@@ -5051,138 +5059,124 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s15
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s14
-; GFX12-NEXT:    v_lshrrev_b16 v3, 8, s13
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s12
-; GFX12-NEXT:    s_ashr_i32 s49, s15, 24
-; GFX12-NEXT:    s_bfe_i32 s50, s15, 0x80010
-; GFX12-NEXT:    s_sext_i32_i8 s15, s15
-; GFX12-NEXT:    s_ashr_i32 s47, s14, 24
-; GFX12-NEXT:    s_bfe_i32 s48, s14, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s64, s15, 24
+; GFX12-NEXT:    s_bfe_i32 s65, s15, 0x80010
+; GFX12-NEXT:    s_sext_i32_i8 s66, s15
+; GFX12-NEXT:    s_bfe_i32 s15, s15, 0x80008
+; GFX12-NEXT:    s_ashr_i32 s61, s14, 24
+; GFX12-NEXT:    s_bfe_i32 s62, s14, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s63, s14, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s14, s14
-; GFX12-NEXT:    v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v52, s15
-; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s11
-; GFX12-NEXT:    s_ashr_i32 s45, s13, 24
-; GFX12-NEXT:    s_bfe_i32 s46, s13, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s15
+; GFX12-NEXT:    s_ashr_i32 s58, s13, 24
+; GFX12-NEXT:    s_bfe_i32 s59, s13, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s60, s13, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s13, s13
-; GFX12-NEXT:    v_bfe_i32 v53, v0, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v54, s50 :: v_dual_mov_b32 v55, s49
-; GFX12-NEXT:    v_lshrrev_b16 v7, 8, s10
-; GFX12-NEXT:    s_ashr_i32 s43, s12, 24
-; GFX12-NEXT:    s_bfe_i32 s44, s12, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v0, s66 :: v_dual_mov_b32 v3, s64
+; GFX12-NEXT:    v_dual_mov_b32 v2, s65 :: v_dual_mov_b32 v5, s63
+; GFX12-NEXT:    s_ashr_i32 s55, s12, 24
+; GFX12-NEXT:    s_bfe_i32 s56, s12, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s57, s12, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s12, s12
-; GFX12-NEXT:    v_bfe_i32 v49, v2, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v48, s14 :: v_dual_mov_b32 v51, s47
-; GFX12-NEXT:    v_dual_mov_b32 v50, s48 :: v_dual_mov_b32 v47, s45
-; GFX12-NEXT:    v_lshrrev_b16 v8, 8, s9
-; GFX12-NEXT:    s_ashr_i32 s41, s11, 24
-; GFX12-NEXT:    s_bfe_i32 s42, s11, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v7, s61
+; GFX12-NEXT:    v_dual_mov_b32 v6, s62 :: v_dual_mov_b32 v9, s60
+; GFX12-NEXT:    v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v11, s58
+; GFX12-NEXT:    v_dual_mov_b32 v10, s59 :: v_dual_mov_b32 v13, s57
+; GFX12-NEXT:    s_ashr_i32 s52, s11, 24
+; GFX12-NEXT:    s_bfe_i32 s53, s11, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s54, s11, 0x80008
+; GFX12-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s55
+; GFX12-NEXT:    v_mov_b32_e32 v14, s56
 ; GFX12-NEXT:    s_sext_i32_i8 s11, s11
-; GFX12-NEXT:    v_bfe_i32 v45, v3, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s43
-; GFX12-NEXT:    v_mov_b32_e32 v46, s46
-; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT:    s_ashr_i32 s39, s10, 24
-; GFX12-NEXT:    v_bfe_i32 v41, v4, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v40, s12 :: v_dual_mov_b32 v57, s42
-; GFX12-NEXT:    v_mov_b32_e32 v42, s44
-; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT:    s_bfe_i32 s40, s10, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s49, s10, 24
+; GFX12-NEXT:    s_bfe_i32 s50, s10, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s51, s10, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s10, s10
-; GFX12-NEXT:    v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT:    v_lshrrev_b16 v15, 8, s4
-; GFX12-NEXT:    s_ashr_i32 s37, s9, 24
-; GFX12-NEXT:    s_bfe_i32 s38, s9, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s46, s9, 24
+; GFX12-NEXT:    s_bfe_i32 s47, s9, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s48, s9, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s9, s9
-; GFX12-NEXT:    v_bfe_i32 v56, v6, 0, 8
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v59, v[52:55], s[16:17] offset:240
-; GFX12-NEXT:    global_store_b128 v59, v[48:51], s[16:17] offset:224
-; GFX12-NEXT:    global_store_b128 v59, v[44:47], s[16:17] offset:208
-; GFX12-NEXT:    global_store_b128 v59, v[40:43], s[16:17] offset:192
-; GFX12-NEXT:    v_mov_b32_e32 v41, s39
-; GFX12-NEXT:    v_dual_mov_b32 v55, s11 :: v_dual_mov_b32 v58, s41
-; GFX12-NEXT:    v_mov_b32_e32 v37, s37
-; GFX12-NEXT:    s_ashr_i32 s33, s7, 24
-; GFX12-NEXT:    s_ashr_i32 s35, s8, 24
-; GFX12-NEXT:    s_bfe_i32 s36, s8, 0x80010
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s54 :: v_dual_mov_b32 v0, s11
+; GFX12-NEXT:    v_dual_mov_b32 v3, s52 :: v_dual_mov_b32 v2, s53
+; GFX12-NEXT:    v_mov_b32_e32 v5, s51
+; GFX12-NEXT:    s_ashr_i32 s43, s8, 24
+; GFX12-NEXT:    s_bfe_i32 s44, s8, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s45, s8, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s8, s8
-; GFX12-NEXT:    v_bfe_i32 v39, v7, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v38, s10 :: v_dual_mov_b32 v33, s35
-; GFX12-NEXT:    v_dual_mov_b32 v40, s40 :: v_dual_mov_b32 v29, s33
-; GFX12-NEXT:    v_lshrrev_b16 v13, 8, s3
-; GFX12-NEXT:    s_ashr_i32 s28, s5, 24
-; GFX12-NEXT:    s_ashr_i32 s30, s6, 24
-; GFX12-NEXT:    s_bfe_i32 s31, s6, 0x80010
-; GFX12-NEXT:    s_sext_i32_i8 s6, s6
-; GFX12-NEXT:    s_bfe_i32 s34, s7, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s49
+; GFX12-NEXT:    v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s48
+; GFX12-NEXT:    s_ashr_i32 s40, s7, 24
+; GFX12-NEXT:    s_bfe_i32 s41, s7, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s42, s7, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s7, s7
-; GFX12-NEXT:    v_bfe_i32 v35, v8, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v34, s9 :: v_dual_mov_b32 v25, s30
-; GFX12-NEXT:    v_mov_b32_e32 v36, s38
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT:    s_ashr_i32 s18, s0, 24
-; GFX12-NEXT:    s_ashr_i32 s20, s1, 24
-; GFX12-NEXT:    s_ashr_i32 s22, s2, 24
-; GFX12-NEXT:    s_ashr_i32 s24, s3, 24
-; GFX12-NEXT:    s_ashr_i32 s26, s4, 24
-; GFX12-NEXT:    s_bfe_i32 s29, s5, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s46
+; GFX12-NEXT:    v_dual_mov_b32 v10, s47 :: v_dual_mov_b32 v13, s45
+; GFX12-NEXT:    s_ashr_i32 s34, s5, 24
+; GFX12-NEXT:    s_bfe_i32 s35, s5, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s36, s5, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s5, s5
-; GFX12-NEXT:    v_bfe_i32 v31, v10, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v30, s8 :: v_dual_mov_b32 v19, s26
-; GFX12-NEXT:    v_mov_b32_e32 v32, s36
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s1
-; GFX12-NEXT:    s_bfe_i32 s27, s4, 0x80010
+; GFX12-NEXT:    s_ashr_i32 s37, s6, 24
+; GFX12-NEXT:    s_bfe_i32 s38, s6, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s39, s6, 0x80008
+; GFX12-NEXT:    s_sext_i32_i8 s6, s6
+; GFX12-NEXT:    v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s43
+; GFX12-NEXT:    v_dual_mov_b32 v14, s44 :: v_dual_mov_b32 v17, s42
+; GFX12-NEXT:    s_ashr_i32 s30, s4, 24
+; GFX12-NEXT:    s_bfe_i32 s31, s4, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s33, s4, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s4, s4
-; GFX12-NEXT:    v_bfe_i32 v23, v12, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v27, v11, 0, 8
-; GFX12-NEXT:    v_dual_mov_b32 v26, s7 :: v_dual_mov_b32 v11, s22
-; GFX12-NEXT:    v_dual_mov_b32 v28, s34 :: v_dual_mov_b32 v7, s20
-; GFX12-NEXT:    v_dual_mov_b32 v22, s6 :: v_dual_mov_b32 v3, s18
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s0
-; GFX12-NEXT:    s_bfe_i32 s25, s3, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v16, s7 :: v_dual_mov_b32 v19, s40
+; GFX12-NEXT:    v_dual_mov_b32 v18, s41 :: v_dual_mov_b32 v21, s39
+; GFX12-NEXT:    s_ashr_i32 s27, s3, 24
+; GFX12-NEXT:    s_bfe_i32 s28, s3, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s29, s3, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s3, s3
-; GFX12-NEXT:    v_bfe_i32 v17, v15, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v21, v14, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v24, s31
-; GFX12-NEXT:    v_dual_mov_b32 v20, s5 :: v_dual_mov_b32 v15, s24
+; GFX12-NEXT:    v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v23, s37
+; GFX12-NEXT:    v_mov_b32_e32 v22, s38
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v59, v[55:58], s[16:17] offset:176
-; GFX12-NEXT:    global_store_b128 v59, v[38:41], s[16:17] offset:160
-; GFX12-NEXT:    global_store_b128 v59, v[34:37], s[16:17] offset:144
-; GFX12-NEXT:    global_store_b128 v59, v[30:33], s[16:17] offset:128
-; GFX12-NEXT:    global_store_b128 v59, v[26:29], s[16:17] offset:112
-; GFX12-NEXT:    global_store_b128 v59, v[22:25], s[16:17] offset:96
-; GFX12-NEXT:    v_dual_mov_b32 v22, s29 :: v_dual_mov_b32 v23, s28
-; GFX12-NEXT:    s_bfe_i32 s23, s2, 0x80010
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s36 :: v_dual_mov_b32 v0, s5
+; GFX12-NEXT:    v_dual_mov_b32 v3, s34 :: v_dual_mov_b32 v2, s35
+; GFX12-NEXT:    v_mov_b32_e32 v5, s33
+; GFX12-NEXT:    s_ashr_i32 s24, s2, 24
+; GFX12-NEXT:    s_bfe_i32 s25, s2, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s26, s2, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s2, s2
-; GFX12-NEXT:    v_mov_b32_e32 v16, s4
-; GFX12-NEXT:    v_mov_b32_e32 v18, s27
-; GFX12-NEXT:    s_bfe_i32 s21, s1, 0x80010
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s30
+; GFX12-NEXT:    v_dual_mov_b32 v6, s31 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT:    s_ashr_i32 s21, s1, 24
+; GFX12-NEXT:    s_bfe_i32 s22, s1, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s23, s1, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s1, s1
-; GFX12-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v12, s3
-; GFX12-NEXT:    v_mov_b32_e32 v14, s25
+; GFX12-NEXT:    v_dual_mov_b32 v8, s3 :: v_dual_mov_b32 v11, s27
+; GFX12-NEXT:    v_dual_mov_b32 v10, s28 :: v_dual_mov_b32 v13, s26
+; GFX12-NEXT:    s_ashr_i32 s18, s0, 24
 ; GFX12-NEXT:    s_bfe_i32 s19, s0, 0x80010
+; GFX12-NEXT:    s_bfe_i32 s20, s0, 0x80008
 ; GFX12-NEXT:    s_sext_i32_i8 s0, s0
-; GFX12-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v8, s2
-; GFX12-NEXT:    v_mov_b32_e32 v10, s23
-; GFX12-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v4, s1
-; GFX12-NEXT:    v_mov_b32_e32 v6, s21
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s19
+; GFX12-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s24
+; GFX12-NEXT:    v_dual_mov_b32 v14, s25 :: v_dual_mov_b32 v17, s23
+; GFX12-NEXT:    v_dual_mov_b32 v16, s1 :: v_dual_mov_b32 v19, s21
+; GFX12-NEXT:    v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v21, s20
+; GFX12-NEXT:    v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s18
+; GFX12-NEXT:    v_mov_b32_e32 v22, s19
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v59, v[20:23], s[16:17] offset:80
-; GFX12-NEXT:    global_store_b128 v59, v[16:19], s[16:17] offset:64
-; GFX12-NEXT:    global_store_b128 v59, v[12:15], s[16:17] offset:48
-; GFX12-NEXT:    global_store_b128 v59, v[8:11], s[16:17] offset:32
-; GFX12-NEXT:    global_store_b128 v59, v[4:7], s[16:17] offset:16
-; GFX12-NEXT:    global_store_b128 v59, v[0:3], s[16:17]
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[16:17] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[16:17] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[16:17] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[16:17] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[16:17] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[16:17]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -5574,17 +5568,18 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64:
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NOHSA-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e32 v2, 8, v2
+; GFX8-NOHSA-NEXT:    v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -5625,10 +5620,10 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, v0
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v0
 ; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5690,7 +5685,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
+; GFX8-NOHSA-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX8-NOHSA-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v2, 0, 8
 ; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -5736,7 +5731,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v0, v4, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 8
@@ -5812,8 +5807,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s3, s2, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s4, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s2
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s5, s2, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s2, 0x80010
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
@@ -5823,8 +5818,8 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -5867,18 +5862,16 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s2
 ; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80010
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX12-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80008
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5957,24 +5950,25 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s2, 24
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 16
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s9
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -6018,19 +6012,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s2
 ; GFX12-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX12-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX12-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 8
 ; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
-; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT:    v_mov_b32_e32 v6, s6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v3, s9
+; GFX12-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
@@ -6132,37 +6125,37 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT:    s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s2
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s8, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s3, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s2, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s8, s2, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s2, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s10, s3, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s3, 0x80010
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -6225,29 +6218,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x80010
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
 ; GFX12-NEXT:    s_lshr_b32 s5, s3, 24
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x80008
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x80010
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    v_mov_b32_e32 v0, s5
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x80010
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80008
 ; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -6362,54 +6352,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NOHSA-NEXT:    s_mov_b32 s5, 0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX8-NOHSA-NEXT:    s_mov_b32 s4, s3
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s3, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s8, s3
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s2, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[2:3], s[2:3], 56
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[2:3], s[2:3], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v1, 0, 8
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v6, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -6473,39 +6464,37 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    s_mov_b32 s5, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v7, 8, s3
-; GFX12-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX12-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX12-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX12-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-NEXT:    s_mov_b32 s4, s3
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX12-NEXT:    s_lshr_b32 s6, s3, 8
+; GFX12-NEXT:    s_mov_b32 s8, s3
+; GFX12-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX12-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX12-NEXT:    s_lshr_b32 s14, s2, 8
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[2:3], 0x80000
+; GFX12-NEXT:    s_ashr_i64 s[2:3], s[2:3], 56
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX12-NEXT:    v_bfe_i32 v14, v7, 0, 8
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX12-NEXT:    s_ashr_i64 s[2:3], s[2:3], 56
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
-; GFX12-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13
-; GFX12-NEXT:    v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_mov_b32_e32 v12, s4
-; GFX12-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s17
+; GFX12-NEXT:    v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT:    v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11
+; GFX12-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT:    v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s15
+; GFX12-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1]
 ; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -6666,15 +6655,16 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s7, 24
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s9, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT:    s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s5
-; GFX8-NOHSA-NEXT:    s_and_b32 s13, s7, 0xff
-; GFX8-NOHSA-NEXT:    s_and_b32 s14, s6, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s6
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s10, s6, 0x80008
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s11, s7, 0x80008
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s12, s5, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s14, s4, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s15, s4, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s16, s5, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s17, s7, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s18, s6, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s5, 0x80010
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s3, s7, 0x80010
@@ -6696,41 +6686,40 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 64
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s18
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v8, 8, s7
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v8
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -6837,43 +6826,39 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_lshr_b32 s2, s5, 24
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    s_bfe_u32 s3, s5, 0x80010
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s7
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:112
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    s_lshr_b32 s2, s6, 24
 ; GFX12-NEXT:    s_bfe_u32 s3, s6, 0x80010
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-NEXT:    s_lshr_b32 s2, s4, 24
-; GFX12-NEXT:    s_bfe_u32 s3, s4, 0x80010
+; GFX12-NEXT:    s_bfe_u32 s2, s6, 0x80008
+; GFX12-NEXT:    s_and_b32 s3, s6, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s5
-; GFX12-NEXT:    s_and_b32 s2, s7, 0xff
+; GFX12-NEXT:    s_bfe_u32 s2, s7, 0x80008
+; GFX12-NEXT:    s_and_b32 s3, s7, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:64
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s4
-; GFX12-NEXT:    s_and_b32 s2, s5, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_bfe_u32 s2, s5, 0x80008
+; GFX12-NEXT:    s_and_b32 s3, s5, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    s_and_b32 s2, s4, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_lshr_b32 s2, s4, 24
+; GFX12-NEXT:    s_bfe_u32 s3, s4, 0x80010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_bfe_u32 s2, s4, 0x80008
+; GFX12-NEXT:    s_and_b32 s3, s4, 0xff
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -7076,25 +7061,28 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s11, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s10, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s10, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s9, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s8, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s8, 24
-; GFX8-NOHSA-NEXT:    s_mov_b32 s24, s11
-; GFX8-NOHSA-NEXT:    s_mov_b32 s4, s9
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s11
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s10
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s9
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s8
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[10:11], 0x80000
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[8:9], s[8:9], 56
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[10:11], s[10:11], 56
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s7, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s7, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s14, s7
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s6, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s6, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s6, 8
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s5, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s24, s5, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s26, s5
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s28, s4, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s30, s4, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[34:35], s[4:5], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[36:37], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[38:39], s[6:7], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[30:31], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[28:29], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -7102,74 +7090,76 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s10
-; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 0x70
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v10, v1, 0, 8
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v14, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s11
-; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s13
+; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 0x70
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s39
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 0x60
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 0x50
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s15
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s17
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s8
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s0, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s9
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s19
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s0, 16
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s20
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s21
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT:    s_add_u32 s8, s0, 0x60
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s9, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s25
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s6
-; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 64
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 64
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s37
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 48
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s35
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT:    s_add_u32 s10, s0, 32
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s27
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s11
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v6, v4, 0, 8
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -7279,62 +7269,61 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v10, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v11, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v21, 8, s5
-; GFX12-NEXT:    v_lshrrev_b16 v23, 8, s4
-; GFX12-NEXT:    s_lshr_b32 s8, s7, 16
-; GFX12-NEXT:    s_lshr_b32 s10, s6, 16
-; GFX12-NEXT:    s_lshr_b32 s12, s6, 24
-; GFX12-NEXT:    v_bfe_i32 v22, v10, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v10, v11, 0, 8
-; GFX12-NEXT:    s_lshr_b32 s18, s4, 24
-; GFX12-NEXT:    s_mov_b32 s20, s7
-; GFX12-NEXT:    s_lshr_b32 s14, s5, 16
-; GFX12-NEXT:    s_bfe_i64 s[24:25], s[6:7], 0x80000
+; GFX12-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX12-NEXT:    s_lshr_b32 s8, s7, 8
+; GFX12-NEXT:    s_mov_b32 s10, s7
+; GFX12-NEXT:    s_lshr_b32 s12, s6, 16
+; GFX12-NEXT:    s_lshr_b32 s14, s6, 24
+; GFX12-NEXT:    s_lshr_b32 s16, s6, 8
+; GFX12-NEXT:    s_bfe_i64 s[34:35], s[6:7], 0x80000
 ; GFX12-NEXT:    s_ashr_i64 s[6:7], s[6:7], 56
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX12-NEXT:    v_bfe_i32 v28, v21, 0, 8
-; GFX12-NEXT:    s_lshr_b32 s16, s4, 16
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    s_lshr_b32 s18, s5, 16
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v5, s35
+; GFX12-NEXT:    v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v9, s11
+; GFX12-NEXT:    s_lshr_b32 s20, s5, 8
 ; GFX12-NEXT:    s_mov_b32 s22, s5
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[4:5], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s9
+; GFX12-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s13
+; GFX12-NEXT:    s_lshr_b32 s24, s4, 16
+; GFX12-NEXT:    s_lshr_b32 s26, s4, 24
+; GFX12-NEXT:    s_lshr_b32 s28, s4, 8
+; GFX12-NEXT:    s_bfe_i64 s[30:31], s[4:5], 0x80000
 ; GFX12-NEXT:    s_ashr_i64 s[4:5], s[4:5], 56
-; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT:    v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11
-; GFX12-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
-; GFX12-NEXT:    v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT:    v_bfe_i32 v24, v23, 0, 8
+; GFX12-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17
 ; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
-; GFX12-NEXT:    v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
-; GFX12-NEXT:    v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23
-; GFX12-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x80000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v17, s17
-; GFX12-NEXT:    v_ashrrev_i32_e32 v29, 31, v28
-; GFX12-NEXT:    v_mov_b32_e32 v26, s22
-; GFX12-NEXT:    v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
-; GFX12-NEXT:    v_ashrrev_i32_e32 v25, 31, v24
+; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s5
+; GFX12-NEXT:    v_mov_b32_e32 v18, s4
+; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v30, v[0:3], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v30, v[20:23], s[0:1] offset:96
-; GFX12-NEXT:    v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s23 :: v_dual_mov_b32 v0, s22
+; GFX12-NEXT:    v_dual_mov_b32 v3, s21 :: v_dual_mov_b32 v2, s20
+; GFX12-NEXT:    v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v8, s24
+; GFX12-NEXT:    v_dual_mov_b32 v11, s27 :: v_dual_mov_b32 v10, s26
+; GFX12-NEXT:    v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v20, s30
+; GFX12-NEXT:    v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v22, s28
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v30, v[12:15], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v30, v[8:11], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v30, v[4:7], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v30, v[26:29], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v30, v[16:19], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v30, v[22:25], s[0:1]
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[0:1] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[0:1] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[0:1] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -7610,154 +7599,149 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s5, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s7, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s9, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s15, s11, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s10, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s17, s8, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s19, s4, 24
-; GFX8-NOHSA-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s4
-; GFX8-NOHSA-NEXT:    s_and_b32 s3, s5, 0xff
-; GFX8-NOHSA-NEXT:    s_and_b32 s20, s6, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s6
-; GFX8-NOHSA-NEXT:    s_and_b32 s21, s7, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT:    s_and_b32 s22, s8, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v8, 8, s8
-; GFX8-NOHSA-NEXT:    s_and_b32 s23, s9, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s9
-; GFX8-NOHSA-NEXT:    s_and_b32 s24, s10, 0xff
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v12, 8, s10
-; GFX8-NOHSA-NEXT:    s_and_b32 s25, s11, 0xff
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s26, s4, 0x80010
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s8, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s9, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s17, s11, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s13, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s6, s15, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s19, s15, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s14, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s21, s14, 0x80008
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s22, s13, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s23, s12, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s24, s12, 0x80008
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s25, s11, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s26, s10, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s27, s10, 0x80008
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s28, s9, 0x80008
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s8, 24
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s2, s8, 0x80008
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, s8, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s5, s8, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s29, s10, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s10, s10, 0x80010
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s27, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s30, s11, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s31, s12, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s12, s12, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s33, s13, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s34, s14, 0xff
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s14, s14, 0x80010
+; GFX8-NOHSA-NEXT:    s_and_b32 s35, s15, 0xff
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s9, 0x80010
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s11, 0x80010
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xf0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xb0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0x70
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 48
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s13, s13, 0x80010
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s7, s15, 0x80010
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xf0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s13
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xd0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s27
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0x90
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0x50
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s17
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xb0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xe0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0x70
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 48
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xe0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s35
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v13, 8, s11
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xc0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s25
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v13
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0xa0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v12
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0x80
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s23
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 0x60
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 64
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s21
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xd0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xc0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0xa0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s33
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0x90
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s23
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0x80
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0x60
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s30
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 0x50
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 64
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s27
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s0, 32
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT:    s_add_u32 s4, s0, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s5
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s4
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[5:6], v[0:3]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, v4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -7945,15 +7929,11 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_lshr_b32 s10, s5, 24
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, v1
 ; GFX12-NEXT:    s_bfe_u32 s11, s5, 0x80010
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:240
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX12-NEXT:    s_lshr_b32 s10, s3, 24
 ; GFX12-NEXT:    s_bfe_u32 s11, s3, 0x80010
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:176
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s10
@@ -7962,70 +7942,66 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:112
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX12-NEXT:    v_mov_b32_e32 v2, s10
-; GFX12-NEXT:    s_lshr_b32 s10, s6, 24
-; GFX12-NEXT:    s_bfe_u32 s11, s6, 0x80010
-; GFX12-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX12-NEXT:    s_bfe_u32 s10, s7, 0x80008
+; GFX12-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:48
-; GFX12-NEXT:    v_mov_b32_e32 v0, s11
-; GFX12-NEXT:    v_mov_b32_e32 v2, s10
-; GFX12-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX12-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:208
-; GFX12-NEXT:    v_mov_b32_e32 v0, s11
-; GFX12-NEXT:    v_mov_b32_e32 v2, s10
-; GFX12-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX12-NEXT:    s_bfe_u32 s11, s2, 0x80010
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:144
-; GFX12-NEXT:    v_mov_b32_e32 v0, s11
-; GFX12-NEXT:    v_mov_b32_e32 v2, s10
-; GFX12-NEXT:    s_lshr_b32 s10, s0, 24
-; GFX12-NEXT:    s_bfe_u32 s11, s0, 0x80010
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:80
-; GFX12-NEXT:    v_mov_b32_e32 v0, s11
-; GFX12-NEXT:    v_mov_b32_e32 v2, s10
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s7
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s5
-; GFX12-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s10
+; GFX12-NEXT:    s_lshr_b32 s7, s6, 24
+; GFX12-NEXT:    s_bfe_u32 s10, s6, 0x80010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:224
+; GFX12-NEXT:    v_mov_b32_e32 v0, s10
+; GFX12-NEXT:    v_mov_b32_e32 v2, s7
+; GFX12-NEXT:    s_bfe_u32 s7, s6, 0x80008
+; GFX12-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:208
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s4
-; GFX12-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s7
+; GFX12-NEXT:    s_bfe_u32 s6, s5, 0x80008
+; GFX12-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:192
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s5
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX12-NEXT:    s_bfe_u32 s6, s4, 0x80010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:160
+; GFX12-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-NEXT:    v_mov_b32_e32 v2, s5
+; GFX12-NEXT:    s_bfe_u32 s5, s4, 0x80008
+; GFX12-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:144
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s2
-; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s5
+; GFX12-NEXT:    s_bfe_u32 s4, s3, 0x80008
+; GFX12-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:128
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s3
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s1
-; GFX12-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x80010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:96
+; GFX12-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80008
+; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:80
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s0
-; GFX12-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-NEXT:    s_bfe_u32 s2, s1, 0x80008
+; GFX12-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:64
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s1
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v5
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX12-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX12-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9] offset:16
 ; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[8:9]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -8410,194 +8386,211 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s28, s7, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s30, s6, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s34, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s36, s5, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s38, s4, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s46, s7, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s48, s7, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s50, s7
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s52, s6, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s54, s6, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s56, s6, 8
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s58, s5, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s60, s5, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s62, s5
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s44, s4, 16
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s40, s4, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s42, s3, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s44, s2, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s46, s2, 24
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s26, s1, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s0, 16
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s0, 24
-; GFX8-NOHSA-NEXT:    s_mov_b32 s48, s7
-; GFX8-NOHSA-NEXT:    s_mov_b32 s50, s5
-; GFX8-NOHSA-NEXT:    s_mov_b32 s52, s3
-; GFX8-NOHSA-NEXT:    s_mov_b32 s54, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v1, 8, s6
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s5
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v5, 8, s4
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v8, 8, s3
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v9, 8, s2
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s1
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s38, s4, 8
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s36, s3, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s30, s3, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s28, s3
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s24, s2, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s22, s2, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s20, s2, 8
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s18, s1, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s1, 8
+; GFX8-NOHSA-NEXT:    s_mov_b32 s64, s1
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s66, s0, 16
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s68, s0, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s70, s0, 8
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[18:19], s[0:1], 56
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[24:25], s[2:3], 56
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[56:57], s[4:5], 56
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[58:59], s[6:7], 56
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[0:1], s[54:55], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[52:53], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[48:49], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[12:13], s[0:1], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[16:17], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[26:27], s[2:3], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[34:35], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[42:43], s[4:5], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[72:73], s[6:7], 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[74:75], s[6:7], 56
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[0:1], s[70:71], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[2:3], s[68:69], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[4:5], s[66:67], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[6:7], s[64:65], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX8-NOHSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
 ; GFX8-NOHSA-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s28
-; GFX8-NOHSA-NEXT:    s_add_u32 s28, s8, 0xf0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v10, v9, 0, 8
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v14, v8, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s29
-; GFX8-NOHSA-NEXT:    s_addc_u32 s29, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s28
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s58
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s59
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s29
-; GFX8-NOHSA-NEXT:    s_add_u32 s28, s8, 0xd0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s29, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s28
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s30
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s31
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s34
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s35
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s29
-; GFX8-NOHSA-NEXT:    s_add_u32 s28, s8, 0xb0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s29, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s28
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s36
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s37
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s56
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s57
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s29
-; GFX8-NOHSA-NEXT:    s_add_u32 s28, s8, 0x90
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s29, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s28
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s38
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s39
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s40
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s41
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s29
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v20, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s24
-; GFX8-NOHSA-NEXT:    s_add_u32 s24, s8, 0x70
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s25
-; GFX8-NOHSA-NEXT:    s_addc_u32 s25, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s42
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s43
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s25
-; GFX8-NOHSA-NEXT:    s_add_u32 s24, s8, 0x50
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s25, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s44
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s45
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s46
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s47
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s25
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v22, v4, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s18
-; GFX8-NOHSA-NEXT:    s_add_u32 s18, s8, 48
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s19
-; GFX8-NOHSA-NEXT:    s_addc_u32 s19, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s26
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s27
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s19
-; GFX8-NOHSA-NEXT:    s_add_u32 s18, s8, 16
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s19, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s23
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v18, s20
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v19, s21
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s19
-; GFX8-NOHSA-NEXT:    s_add_u32 s6, s8, 0xe0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[16:19]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v21, s7
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v18, v5, 0, 8
-; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT:    s_add_u32 s6, s8, 0xc0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[20:23]
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[62:63], s[62:63], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[48:49], s[48:49], 0x80000
+; GFX8-NOHSA-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s46
+; GFX8-NOHSA-NEXT:    s_add_u32 s46, s8, 0xf0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s47
+; GFX8-NOHSA-NEXT:    s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s74
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s75
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT:    s_add_u32 s46, s8, 0xe0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s51
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s49
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT:    s_add_u32 s46, s8, 0xd0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s52
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s53
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s54
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s55
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT:    s_add_u32 s46, s8, 0xc0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s47, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s46
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s72
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s73
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s56
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s57
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s47
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s42
+; GFX8-NOHSA-NEXT:    s_add_u32 s42, s8, 0xb0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s43
+; GFX8-NOHSA-NEXT:    s_addc_u32 s43, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s58
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s59
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NOHSA-NEXT:    s_add_u32 s42, s8, 0xa0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s43, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s62
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s63
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s60
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s61
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s40
+; GFX8-NOHSA-NEXT:    s_add_u32 s40, s8, 0x90
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s41
+; GFX8-NOHSA-NEXT:    s_addc_u32 s41, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s40
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s44
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s45
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s41
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s34
+; GFX8-NOHSA-NEXT:    s_add_u32 s34, s8, 0x80
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s35
+; GFX8-NOHSA-NEXT:    s_addc_u32 s35, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s34
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s38
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s39
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s35
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s26
+; GFX8-NOHSA-NEXT:    s_add_u32 s26, s8, 0x70
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s27
+; GFX8-NOHSA-NEXT:    s_addc_u32 s27, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s36
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s37
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NOHSA-NEXT:    s_add_u32 s26, s8, 0x60
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_addc_u32 s27, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s26
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s28
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s29
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s30
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s31
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s27
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s22
+; GFX8-NOHSA-NEXT:    s_add_u32 s22, s8, 0x50
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s23
+; GFX8-NOHSA-NEXT:    s_addc_u32 s23, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s25
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s16
+; GFX8-NOHSA-NEXT:    s_add_u32 s16, s8, 64
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s17
+; GFX8-NOHSA-NEXT:    s_addc_u32 s17, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s17
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    s_add_u32 s12, s8, 48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NOHSA-NEXT:    s_addc_u32 s13, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s18
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s13
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    s_add_u32 s6, s8, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s7, s9, 0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v22, v1, 0, 8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s6
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v20, s16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v21, s17
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s15
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[20:23]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v20, s4
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s8, 0xa0
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v22, v0, 0, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v21, s5
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT:    s_add_u32 s4, s8, 0x80
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[20:23]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s5, s9, 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT:    s_add_u32 s2, s8, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s9, 0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s2
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s8, 0x60
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT:    s_add_u32 s2, s8, 64
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NOHSA-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -8797,113 +8790,116 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v3, 8, s5
-; GFX12-NEXT:    v_lshrrev_b16 v7, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s4
-; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s1
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s0
-; GFX12-NEXT:    s_lshr_b32 s20, s7, 16
-; GFX12-NEXT:    s_lshr_b32 s24, s6, 24
-; GFX12-NEXT:    s_lshr_b32 s26, s5, 16
-; GFX12-NEXT:    s_lshr_b32 s36, s2, 16
-; GFX12-NEXT:    s_lshr_b32 s38, s2, 24
-; GFX12-NEXT:    v_bfe_i32 v10, v7, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v22, v3, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v30, v0, 0, 8
-; GFX12-NEXT:    s_lshr_b32 s42, s0, 16
-; GFX12-NEXT:    s_mov_b32 s46, s7
-; GFX12-NEXT:    s_mov_b32 s48, s5
-; GFX12-NEXT:    s_mov_b32 s50, s3
-; GFX12-NEXT:    s_lshr_b32 s22, s6, 16
-; GFX12-NEXT:    s_lshr_b32 s28, s4, 16
-; GFX12-NEXT:    s_lshr_b32 s30, s4, 24
-; GFX12-NEXT:    s_lshr_b32 s40, s1, 16
-; GFX12-NEXT:    s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GFX12-NEXT:    s_ashr_i64 s[54:55], s[2:3], 56
-; GFX12-NEXT:    s_ashr_i64 s[56:57], s[4:5], 56
-; GFX12-NEXT:    s_ashr_i64 s[6:7], s[6:7], 56
-; GFX12-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v18, v4, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v26, v1, 0, 8
-; GFX12-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX12-NEXT:    s_lshr_b32 s36, s7, 16
+; GFX12-NEXT:    s_lshr_b32 s38, s7, 8
+; GFX12-NEXT:    s_mov_b32 s40, s7
+; GFX12-NEXT:    s_lshr_b32 s42, s6, 16
+; GFX12-NEXT:    s_lshr_b32 s44, s6, 24
+; GFX12-NEXT:    s_ashr_i64 s[74:75], s[6:7], 56
 ; GFX12-NEXT:    s_bfe_i64 s[36:37], s[36:37], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT:    s_lshr_b32 s34, s3, 16
-; GFX12-NEXT:    s_lshr_b32 s44, s0, 24
-; GFX12-NEXT:    s_mov_b32 s52, s1
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[14:15], s[4:5], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[50:51], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[48:49], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v64, 0 :: v_dual_mov_b32 v33, s21
-; GFX12-NEXT:    s_ashr_i64 s[18:19], s[0:1], 56
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-NEXT:    v_bfe_i32 v14, v5, 0, 8
+; GFX12-NEXT:    s_lshr_b32 s46, s6, 8
 ; GFX12-NEXT:    s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v35, s7
-; GFX12-NEXT:    v_dual_mov_b32 v34, s6 :: v_dual_mov_b32 v37, s23
-; GFX12-NEXT:    v_dual_mov_b32 v38, s24 :: v_dual_mov_b32 v41, s27
-; GFX12-NEXT:    v_dual_mov_b32 v40, s26 :: v_dual_mov_b32 v43, s57
-; GFX12-NEXT:    v_dual_mov_b32 v42, s56 :: v_dual_mov_b32 v45, s29
-; GFX12-NEXT:    v_dual_mov_b32 v50, s54 :: v_dual_mov_b32 v53, s37
-; GFX12-NEXT:    v_dual_mov_b32 v52, s36 :: v_dual_mov_b32 v55, s39
-; GFX12-NEXT:    v_dual_mov_b32 v54, s38 :: v_dual_mov_b32 v57, s41
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX12-NEXT:    s_bfe_i64 s[0:1], s[52:53], 0x80000
-; GFX12-NEXT:    v_ashrrev_i32_e32 v11, 31, v10
-; GFX12-NEXT:    v_ashrrev_i32_e32 v23, 31, v22
-; GFX12-NEXT:    v_ashrrev_i32_e32 v31, 31, v30
+; GFX12-NEXT:    s_bfe_i64 s[38:39], s[38:39], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s37
 ; GFX12-NEXT:    s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v60, s42 :: v_dual_mov_b32 v29, s47
-; GFX12-NEXT:    v_dual_mov_b32 v28, s46 :: v_dual_mov_b32 v63, s45
-; GFX12-NEXT:    v_dual_mov_b32 v24, s16 :: v_dual_mov_b32 v21, s5
-; GFX12-NEXT:    v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v17, s15
-; GFX12-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT:    v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v5, s1
+; GFX12-NEXT:    s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s75
+; GFX12-NEXT:    v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v5, s41
+; GFX12-NEXT:    s_lshr_b32 s48, s5, 16
+; GFX12-NEXT:    s_bfe_i64 s[72:73], s[6:7], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s39
+; GFX12-NEXT:    v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s43
+; GFX12-NEXT:    s_lshr_b32 s50, s5, 8
+; GFX12-NEXT:    s_mov_b32 s52, s5
+; GFX12-NEXT:    v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v11, s45
+; GFX12-NEXT:    v_dual_mov_b32 v10, s44 :: v_dual_mov_b32 v13, s73
+; GFX12-NEXT:    s_lshr_b32 s54, s4, 16
+; GFX12-NEXT:    s_lshr_b32 s56, s4, 24
+; GFX12-NEXT:    s_ashr_i64 s[70:71], s[4:5], 56
+; GFX12-NEXT:    v_dual_mov_b32 v12, s72 :: v_dual_mov_b32 v15, s47
+; GFX12-NEXT:    s_bfe_i64 s[36:37], s[48:49], 0x80000
+; GFX12-NEXT:    v_mov_b32_e32 v14, s46
+; GFX12-NEXT:    s_lshr_b32 s58, s4, 8
+; GFX12-NEXT:    s_bfe_i64 s[52:53], s[52:53], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[50:51], s[50:51], 0x80000
+; GFX12-NEXT:    s_lshr_b32 s60, s3, 16
+; GFX12-NEXT:    s_bfe_i64 s[56:57], s[56:57], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[54:55], s[54:55], 0x80000
+; GFX12-NEXT:    s_clause 0x3
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[8:9] offset:240
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[8:9] offset:224
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[8:9] offset:208
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[8:9] offset:192
+; GFX12-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v0, s36
+; GFX12-NEXT:    v_dual_mov_b32 v3, s71 :: v_dual_mov_b32 v2, s70
+; GFX12-NEXT:    v_mov_b32_e32 v5, s53
+; GFX12-NEXT:    s_lshr_b32 s34, s3, 8
+; GFX12-NEXT:    s_mov_b32 s30, s3
+; GFX12-NEXT:    s_lshr_b32 s24, s2, 16
+; GFX12-NEXT:    s_lshr_b32 s22, s2, 24
+; GFX12-NEXT:    s_bfe_i64 s[28:29], s[4:5], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[58:59], s[58:59], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s52 :: v_dual_mov_b32 v7, s51
+; GFX12-NEXT:    v_dual_mov_b32 v6, s50 :: v_dual_mov_b32 v9, s55
+; GFX12-NEXT:    s_lshr_b32 s20, s2, 8
+; GFX12-NEXT:    s_ashr_i64 s[26:27], s[2:3], 56
+; GFX12-NEXT:    s_bfe_i64 s[60:61], s[60:61], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s54 :: v_dual_mov_b32 v11, s57
+; GFX12-NEXT:    v_dual_mov_b32 v10, s56 :: v_dual_mov_b32 v13, s29
+; GFX12-NEXT:    s_lshr_b32 s18, s1, 16
+; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x80000
 ; GFX12-NEXT:    s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT:    v_dual_mov_b32 v36, s22 :: v_dual_mov_b32 v39, s25
-; GFX12-NEXT:    v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31
-; GFX12-NEXT:    v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35
-; GFX12-NEXT:    v_dual_mov_b32 v56, s40 :: v_dual_mov_b32 v59, s19
-; GFX12-NEXT:    v_dual_mov_b32 v58, s18 :: v_dual_mov_b32 v61, s43
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT:    v_ashrrev_i32_e32 v27, 31, v26
-; GFX12-NEXT:    v_dual_mov_b32 v62, s44 :: v_dual_mov_b32 v25, s17
-; GFX12-NEXT:    v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v1, s11
-; GFX12-NEXT:    v_dual_mov_b32 v48, s34 :: v_dual_mov_b32 v51, s55
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    v_ashrrev_i32_e32 v15, 31, v14
-; GFX12-NEXT:    s_clause 0x9
-; GFX12-NEXT:    global_store_b128 v64, v[32:35], s[8:9] offset:240
-; GFX12-NEXT:    global_store_b128 v64, v[28:31], s[8:9] offset:224
-; GFX12-NEXT:    global_store_b128 v64, v[36:39], s[8:9] offset:208
-; GFX12-NEXT:    global_store_b128 v64, v[24:27], s[8:9] offset:192
-; GFX12-NEXT:    global_store_b128 v64, v[40:43], s[8:9] offset:176
-; GFX12-NEXT:    global_store_b128 v64, v[20:23], s[8:9] offset:160
-; GFX12-NEXT:    global_store_b128 v64, v[44:47], s[8:9] offset:144
-; GFX12-NEXT:    global_store_b128 v64, v[16:19], s[8:9] offset:128
-; GFX12-NEXT:    global_store_b128 v64, v[48:51], s[8:9] offset:112
-; GFX12-NEXT:    global_store_b128 v64, v[12:15], s[8:9] offset:96
-; GFX12-NEXT:    v_mov_b32_e32 v0, s10
+; GFX12-NEXT:    v_dual_mov_b32 v12, s28 :: v_dual_mov_b32 v15, s59
+; GFX12-NEXT:    v_dual_mov_b32 v14, s58 :: v_dual_mov_b32 v17, s61
+; GFX12-NEXT:    s_lshr_b32 s14, s1, 8
+; GFX12-NEXT:    s_mov_b32 s62, s1
+; GFX12-NEXT:    s_bfe_i64 s[16:17], s[2:3], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v16, s60 :: v_dual_mov_b32 v19, s27
+; GFX12-NEXT:    v_dual_mov_b32 v18, s26 :: v_dual_mov_b32 v21, s31
+; GFX12-NEXT:    s_lshr_b32 s64, s0, 16
+; GFX12-NEXT:    s_lshr_b32 s66, s0, 24
+; GFX12-NEXT:    s_ashr_i64 s[12:13], s[0:1], 56
+; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v20, s30 :: v_dual_mov_b32 v23, s35
+; GFX12-NEXT:    v_mov_b32_e32 v22, s34
 ; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v64, v[52:55], s[8:9] offset:80
-; GFX12-NEXT:    global_store_b128 v64, v[8:11], s[8:9] offset:64
-; GFX12-NEXT:    global_store_b128 v64, v[56:59], s[8:9] offset:48
-; GFX12-NEXT:    global_store_b128 v64, v[4:7], s[8:9] offset:32
-; GFX12-NEXT:    global_store_b128 v64, v[60:63], s[8:9] offset:16
-; GFX12-NEXT:    global_store_b128 v64, v[0:3], s[8:9]
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[8:9] offset:176
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[8:9] offset:160
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[8:9] offset:144
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[8:9] offset:128
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[8:9] offset:112
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[8:9] offset:96
+; GFX12-NEXT:    v_dual_mov_b32 v1, s25 :: v_dual_mov_b32 v0, s24
+; GFX12-NEXT:    v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s22
+; GFX12-NEXT:    v_mov_b32_e32 v5, s17
+; GFX12-NEXT:    s_lshr_b32 s68, s0, 8
+; GFX12-NEXT:    s_bfe_i64 s[6:7], s[62:63], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v7, s21
+; GFX12-NEXT:    v_dual_mov_b32 v6, s20 :: v_dual_mov_b32 v9, s19
+; GFX12-NEXT:    s_bfe_i64 s[2:3], s[66:67], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[4:5], s[64:65], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v8, s18 :: v_dual_mov_b32 v11, s13
+; GFX12-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v13, s7
+; GFX12-NEXT:    s_bfe_i64 s[10:11], s[0:1], 0x80000
+; GFX12-NEXT:    s_bfe_i64 s[0:1], s[68:69], 0x80000
+; GFX12-NEXT:    v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s15
+; GFX12-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v17, s5
+; GFX12-NEXT:    v_dual_mov_b32 v16, s4 :: v_dual_mov_b32 v19, s3
+; GFX12-NEXT:    v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v21, s11
+; GFX12-NEXT:    v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1
+; GFX12-NEXT:    v_mov_b32_e32 v22, s0
+; GFX12-NEXT:    s_clause 0x5
+; GFX12-NEXT:    global_store_b128 v24, v[0:3], s[8:9] offset:80
+; GFX12-NEXT:    global_store_b128 v24, v[4:7], s[8:9] offset:64
+; GFX12-NEXT:    global_store_b128 v24, v[8:11], s[8:9] offset:48
+; GFX12-NEXT:    global_store_b128 v24, v[12:15], s[8:9] offset:32
+; GFX12-NEXT:    global_store_b128 v24, v[16:19], s[8:9] offset:16
+; GFX12-NEXT:    global_store_b128 v24, v[20:23], s[8:9]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -9317,7 +9313,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16:
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9325,10 +9320,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
-; GFX8-NOHSA-NEXT:    v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v3, 8, v2
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff0000, v3
+; GFX8-NOHSA-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NOHSA-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -9361,11 +9355,13 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xff, v1
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9421,6 +9417,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX8-NOHSA:       ; %bb.0:
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, 0xffff
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, 8
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9429,8 +9426,8 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NOHSA-NEXT:    v_and_b32_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e32 v2, 8, v2
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NOHSA-NEXT:    v_lshlrev_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NOHSA-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -9474,11 +9471,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 8
-; GFX12-NEXT:    v_ashrrev_i16 v1, 8, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
+; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9538,14 +9537,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v3, s0, v3, 16
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s1, v2
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v2, s0, v2, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s2, 0xff0000
+; GFX8-NOHSA-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -9599,22 +9599,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
 ; GFX12:       ; %bb.0:
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX12-NEXT:    v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT:    v_and_b32_e64 v1, 0xff, s3
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s2
-; GFX12-NEXT:    s_lshr_b32 s2, s2, 24
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX12-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
-; GFX12-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
+; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s4, s2, 24
+; GFX12-NEXT:    s_and_b32 s5, s2, 0xff
+; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, s3
+; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -9680,16 +9677,18 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX8-NOHSA-NEXT:    s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s0, 0x80000
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s1, s2
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s0, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xffff0000
 ; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    s_and_b32 s2, 0xffff, s3
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NOHSA-NEXT:    s_or_b32 s1, s3, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s0, s0, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -9753,17 +9752,17 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_bfe_i32 s4, s2, 0x80000
 ; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX12-NEXT:    v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT:    v_and_b32_e64 v1, 0xffff, s4
-; GFX12-NEXT:    s_ashr_i32 s2, s2, 24
+; GFX12-NEXT:    s_sext_i32_i16 s5, s2
+; GFX12-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s5, s5, 8
 ; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -9841,23 +9840,25 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s2, 24
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s1, s3, 24
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s3, 0x80010
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s5, s3, 0xff
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s3
-; GFX8-NOHSA-NEXT:    s_or_b32 s0, s4, s1
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, s2, 0xff0000
+; GFX8-NOHSA-NEXT:    s_or_b32 s1, s4, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s3, s5, s3
+; GFX8-NOHSA-NEXT:    s_or_b32 s0, s0, s2
 ; GFX8-NOHSA-NEXT:    v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s1, v2
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
 ;
@@ -9944,25 +9945,22 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s5, s2, 16
-; GFX12-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX12-NEXT:    v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT:    v_and_b32_e64 v2, 0xff, s3
-; GFX12-NEXT:    v_and_b32_e64 v3, 0xff, s6
-; GFX12-NEXT:    v_and_b32_e64 v5, 0xff, s5
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v6, 8, s2
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX12-NEXT:    s_lshr_b32 s2, s3, 24
-; GFX12-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
-; GFX12-NEXT:    v_lshl_or_b32 v2, v1, 16, v2
-; GFX12-NEXT:    v_lshl_or_b32 v3, s2, 16, v3
-; GFX12-NEXT:    v_lshl_or_b32 v1, s4, 16, v5
+; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s5, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s6, s3, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s7, s3, 24
+; GFX12-NEXT:    s_bfe_u32 s8, s3, 0x80010
+; GFX12-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX12-NEXT:    s_bfe_u32 s9, s2, 0x80010
+; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s8, s7
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s9, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -10052,29 +10050,33 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s3
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s3, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_or_b32 s7, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s2
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s6, s2, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT:    s_ashr_i64 s[0:1], s[2:3], 56
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s6
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s1, v0
+; GFX8-NOHSA-NEXT:    s_or_b32 s6, s6, s0
+; GFX8-NOHSA-NEXT:    s_ashr_i64 s[0:1], s[2:3], 56
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s5, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NOHSA-NEXT:    s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s3, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s1, v1
 ; GFX8-NOHSA-NEXT:    s_ashr_i32 s1, s2, 24
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s2, s4, 0x80000
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NOHSA-NEXT:    s_or_b32 s1, s2, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s7
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -10175,24 +10177,26 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_bfe_i32 s8, s2, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s9, s3, 0x80000
+; GFX12-NEXT:    s_ashr_i64 s[4:5], s[2:3], 56
 ; GFX12-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX12-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX12-NEXT:    v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT:    v_ashrrev_i16 v2, 8, s3
-; GFX12-NEXT:    s_ashr_i64 s[4:5], s[2:3], 56
-; GFX12-NEXT:    v_and_b32_e64 v3, 0xffff, s8
-; GFX12-NEXT:    v_and_b32_e64 v5, 0xffff, s9
-; GFX12-NEXT:    s_ashr_i32 s2, s2, 24
-; GFX12-NEXT:    s_bfe_i32 s3, s6, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s5, s7, 0x80000
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s5, s4
-; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
-; GFX12-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
-; GFX12-NEXT:    v_mov_b32_e32 v3, s3
+; GFX12-NEXT:    s_bfe_i32 s5, s3, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_ashr_i32 s8, s2, 24
+; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
+; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s6, s8
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
+; GFX12-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -10310,45 +10314,47 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v1, s3, v1, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s3, v0, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s3, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s3, v0
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT:    s_bfe_u32 s4, s7, 0x80010
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s5, 24
+; GFX8-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s5, 0x80010
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NOHSA-NEXT:    s_or_b32 s4, s3, s4
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s3, s7, 24
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s6, 24
+; GFX8-NOHSA-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v1, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX8-NOHSA-NEXT:    s_bfe_u32 s9, s7, 0x80010
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NOHSA-NEXT:    s_and_b32 s10, s5, 0xff
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT:    s_or_b32 s3, s4, s3
-; GFX8-NOHSA-NEXT:    s_and_b32 s4, s7, 0xff
+; GFX8-NOHSA-NEXT:    s_or_b32 s3, s9, s3
+; GFX8-NOHSA-NEXT:    s_and_b32 s9, s7, 0xff
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s6, 24
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s2, v0, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX8-NOHSA-NEXT:    s_and_b32 s5, s5, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s6
-; GFX8-NOHSA-NEXT:    s_or_b32 s8, s9, s8
+; GFX8-NOHSA-NEXT:    s_and_b32 s6, s6, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT:    s_or_b32 s4, s4, s7
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v3, s2, v3, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s2, v2
+; GFX8-NOHSA-NEXT:    s_or_b32 s7, s9, s7
+; GFX8-NOHSA-NEXT:    s_or_b32 s2, s2, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v1, 0xff00ff, v1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s8
@@ -10505,45 +10511,38 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s3, s6, 16
-; GFX12-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX12-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX12-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX12-NEXT:    v_and_b32_e64 v4, 0xff, s5
-; GFX12-NEXT:    v_and_b32_e64 v5, 0xff, s4
-; GFX12-NEXT:    v_and_b32_e64 v6, 0xff, s7
-; GFX12-NEXT:    v_and_b32_e64 v7, 0xff, s6
-; GFX12-NEXT:    v_and_b32_e64 v11, 0xff, s9
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xff, s3
-; GFX12-NEXT:    v_and_b32_e64 v9, 0xff, s13
-; GFX12-NEXT:    v_and_b32_e64 v10, 0xff, s11
-; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v3, 8, s7
-; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s4
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s5
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX12-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX12-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX12-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT:    s_lshr_b32 s2, s6, 24
-; GFX12-NEXT:    s_lshr_b32 s8, s7, 24
-; GFX12-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX12-NEXT:    s_lshr_b32 s12, s5, 24
-; GFX12-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
-; GFX12-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
-; GFX12-NEXT:    v_lshl_or_b32 v6, v3, 16, v6
-; GFX12-NEXT:    v_lshl_or_b32 v4, v1, 16, v7
-; GFX12-NEXT:    v_lshl_or_b32 v7, s8, 16, v11
-; GFX12-NEXT:    v_lshl_or_b32 v5, s2, 16, v12
-; GFX12-NEXT:    v_lshl_or_b32 v3, s12, 16, v9
-; GFX12-NEXT:    v_lshl_or_b32 v1, s10, 16, v10
+; GFX12-NEXT:    s_bfe_u32 s2, s6, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s3, s6, 24
+; GFX12-NEXT:    s_bfe_u32 s8, s7, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s9, s7, 24
+; GFX12-NEXT:    s_bfe_u32 s16, s7, 0x80010
+; GFX12-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX12-NEXT:    s_bfe_u32 s17, s6, 0x80010
+; GFX12-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX12-NEXT:    s_bfe_u32 s10, s4, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s11, s4, 24
+; GFX12-NEXT:    s_bfe_u32 s12, s5, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s13, s5, 24
+; GFX12-NEXT:    s_bfe_u32 s14, s5, 0x80010
+; GFX12-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX12-NEXT:    s_bfe_u32 s15, s4, 0x80010
+; GFX12-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX12-NEXT:    s_pack_ll_b32_b16 s9, s16, s9
+; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s7, s8
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s6, s2
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s17, s3
+; GFX12-NEXT:    s_pack_ll_b32_b16 s13, s14, s13
+; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s5, s12
+; GFX12-NEXT:    s_pack_ll_b32_b16 s11, s15, s11
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s4, s10
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s9
+; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s11
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s13
+; GFX12-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -10685,57 +10684,69 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s10, s5, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v0, 8, s5
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s10, 0xffff, s10
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s10, s5
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s11, s5, 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s5, s5, 16
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s3, 0x80000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s2, s4, 16
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s5, s4, 0x80000
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s10, v0
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s4, s3, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v3, s4, v1
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v1, 8, s2
+; GFX8-NOHSA-NEXT:    s_and_b32 s5, s5, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s12, s4
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s10, s10, 8
+; GFX8-NOHSA-NEXT:    s_or_b32 s5, s3, s5
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s3, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s11, 0xffff, s11
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s12, s12, 8
+; GFX8-NOHSA-NEXT:    s_and_b32 s10, s10, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX8-NOHSA-NEXT:    s_or_b32 s10, s11, s10
+; GFX8-NOHSA-NEXT:    s_and_b32 s11, s12, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s12, s4, 0x80000
+; GFX8-NOHSA-NEXT:    s_or_b32 s4, s2, s3
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s2, s7
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s7, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, s2, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT:    s_or_b32 s11, s12, s11
+; GFX8-NOHSA-NEXT:    s_or_b32 s12, s3, s2
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s2, s6
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s6, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s2, s2, 0xffff0000
 ; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v1, s3, v1
+; GFX8-NOHSA-NEXT:    s_or_b32 s13, s3, s2
 ; GFX8-NOHSA-NEXT:    s_ashr_i64 s[2:3], s[6:7], 56
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s9, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s6, 16
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s2, s2, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NOHSA-NEXT:    s_or_b32 s2, s3, s2
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s7, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v6, s3, v4
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s6, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v4, 8, s6
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v4, s3, v4
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s3, s8, 0x80000
-; GFX8-NOHSA-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX8-NOHSA-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v5, 8, s8
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s3, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s6, s8, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NOHSA-NEXT:    s_or_b32 s3, s6, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NOHSA-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v5, s3, v5
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s5, v0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s12
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s10
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -10916,43 +10927,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_ashr_i64 s[2:3], s[6:7], 56
 ; GFX12-NEXT:    s_lshr_b32 s8, s6, 16
-; GFX12-NEXT:    v_ashrrev_i16 v5, 8, s6
-; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s9, s7, 16
+; GFX12-NEXT:    s_bfe_i32 s3, s7, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s7, s7
+; GFX12-NEXT:    s_ashr_i32 s16, s6, 24
+; GFX12-NEXT:    s_bfe_i32 s17, s6, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX12-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX12-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX12-NEXT:    v_ashrrev_i16 v1, 8, s4
-; GFX12-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v0, 8, s5
-; GFX12-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s12, s7, 0x80000
-; GFX12-NEXT:    s_ashr_i64 s[2:3], s[6:7], 56
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xffff, s6
-; GFX12-NEXT:    s_bfe_i32 s6, s8, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s9, s7, 16
-; GFX12-NEXT:    v_and_b32_e64 v7, 0xffff, s4
-; GFX12-NEXT:    s_bfe_i32 s3, s11, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s4, s10, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v2, 8, s7
-; GFX12-NEXT:    v_and_b32_e64 v4, 0xffff, s5
-; GFX12-NEXT:    v_and_b32_e64 v11, 0xffff, s12
-; GFX12-NEXT:    v_ashrrev_i16 v13, 8, s8
-; GFX12-NEXT:    v_and_b32_e64 v16, 0xffff, s6
-; GFX12-NEXT:    v_ashrrev_i16 v9, 8, s11
-; GFX12-NEXT:    v_ashrrev_i16 v10, 8, s10
-; GFX12-NEXT:    s_bfe_i32 s5, s9, 0x80000
-; GFX12-NEXT:    v_and_b32_e64 v14, 0xffff, s3
-; GFX12-NEXT:    v_and_b32_e64 v15, 0xffff, s4
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
-; GFX12-NEXT:    v_lshl_or_b32 v6, v0, 16, v4
-; GFX12-NEXT:    v_lshl_or_b32 v4, v1, 16, v7
-; GFX12-NEXT:    v_lshl_or_b32 v2, v2, 16, v11
-; GFX12-NEXT:    v_lshl_or_b32 v0, v5, 16, v12
-; GFX12-NEXT:    v_lshl_or_b32 v1, v13, 16, v16
-; GFX12-NEXT:    v_lshl_or_b32 v7, v9, 16, v14
-; GFX12-NEXT:    v_lshl_or_b32 v5, v10, 16, v15
+; GFX12-NEXT:    s_ashr_i32 s12, s5, 16
+; GFX12-NEXT:    s_bfe_i32 s13, s5, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_ashr_i32 s14, s4, 24
+; GFX12-NEXT:    s_bfe_i32 s15, s4, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s4, s4
+; GFX12-NEXT:    s_bfe_i32 s9, s9, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s7, s7, 8
+; GFX12-NEXT:    s_bfe_i32 s8, s8, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s6, s6, 8
+; GFX12-NEXT:    s_lshr_b32 s12, s12, 8
+; GFX12-NEXT:    s_bfe_i32 s11, s11, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s5, s5, 8
+; GFX12-NEXT:    s_bfe_i32 s10, s10, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s4, s4, 8
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s3, s7
+; GFX12-NEXT:    s_pack_ll_b32_b16 s6, s17, s6
+; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s8, s16
+; GFX12-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s13, s5
+; GFX12-NEXT:    s_pack_ll_b32_b16 s10, s10, s14
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s15, s4
+; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s10
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s11
+; GFX12-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
 ; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
@@ -11157,90 +11169,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s1, 24
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v0, 8, s0
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s15, s1, 0x80010
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s0, 24
-; GFX8-NOHSA-NEXT:    s_or_b32 s14, s15, s14
-; GFX8-NOHSA-NEXT:    s_and_b32 s15, s1, 0xff
+; GFX8-NOHSA-NEXT:    s_and_b32 s16, s1, 0xff
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xff0000
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s0, 24
+; GFX8-NOHSA-NEXT:    s_or_b32 s14, s15, s14
+; GFX8-NOHSA-NEXT:    s_or_b32 s15, s16, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s0, 0xff
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xff0000
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s13, v0, 16
+; GFX8-NOHSA-NEXT:    s_or_b32 s13, s1, s0
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s3, 24
-; GFX8-NOHSA-NEXT:    s_or_b32 s15, s15, s1
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s3, 0x80010
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v1, s13, v1, 16
-; GFX8-NOHSA-NEXT:    s_or_b32 s13, s1, s0
+; GFX8-NOHSA-NEXT:    s_or_b32 s16, s1, s0
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s3, 8
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v2, 8, s2
 ; GFX8-NOHSA-NEXT:    s_and_b32 s0, s3, 0xff
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_or_b32 s3, s0, s1
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s2, 8
 ; GFX8-NOHSA-NEXT:    s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s0, v2
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v1, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT:    s_or_b32 s2, s0, s1
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s5, 24
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s5, 0x80010
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s2, 24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT:    s_or_b32 s2, s1, s0
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s12, v0, 16
+; GFX8-NOHSA-NEXT:    s_or_b32 s12, s1, s0
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s5, 8
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v4, 8, s4
 ; GFX8-NOHSA-NEXT:    s_and_b32 s0, s5, 0xff
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_or_b32 s5, s0, s1
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s4, 8
 ; GFX8-NOHSA-NEXT:    s_and_b32 s0, s4, 0xff
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v4, s0, v4
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s7, 24
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s11, s4, 24
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT:    s_or_b32 s4, s0, s1
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s0, s7, 24
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT:    s_bfe_u32 s1, s7, 0x80010
-; GFX8-NOHSA-NEXT:    s_lshl_b32 s4, s7, 8
 ; GFX8-NOHSA-NEXT:    s_or_b32 s0, s1, s0
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, s7, 0xff
-; GFX8-NOHSA-NEXT:    s_and_b32 s4, s4, 0xff0000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s11, v0, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s7, s7, 0xff0000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT:    s_or_b32 s1, s1, s4
-; GFX8-NOHSA-NEXT:    s_and_b32 s4, s6, 0xff
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v5, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    s_or_b32 s1, s1, s7
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT:    s_and_b32 s7, s6, 0xff
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX8-NOHSA-NEXT:    s_and_b32 s6, s6, 0xff0000
+; GFX8-NOHSA-NEXT:    s_or_b32 s6, s7, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 48
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_lshrrev_b16_e64 v6, 8, s6
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s6
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s1
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v7, s10, v7, 16
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NOHSA-NEXT:    v_alignbit_b32 v0, s10, v0, 16
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v10, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 32
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v7, 0xff00ff, v7
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v6, s4, v6
+; GFX8-NOHSA-NEXT:    v_and_b32_e32 v7, 0xff00ff, v0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v5, s11, v5, 16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v5, 0xff00ff, v5
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s12
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_alignbit_b32 v3, s12, v3, 16
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s13
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s16
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT:    v_and_b32_e32 v1, 0xff00ff, v1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s13
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s15
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s14
@@ -11527,83 +11543,68 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s25, s1, 16
-; GFX12-NEXT:    s_lshr_b32 s21, s3, 16
-; GFX12-NEXT:    s_lshr_b32 s23, s0, 16
-; GFX12-NEXT:    v_and_b32_e64 v6, 0xff, s1
-; GFX12-NEXT:    v_and_b32_e64 v10, 0xff, s3
-; GFX12-NEXT:    v_and_b32_e64 v11, 0xff, s2
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xff, s5
-; GFX12-NEXT:    v_and_b32_e64 v13, 0xff, s4
-; GFX12-NEXT:    v_and_b32_e64 v14, 0xff, s25
-; GFX12-NEXT:    v_and_b32_e64 v7, 0xff, s0
-; GFX12-NEXT:    v_and_b32_e64 v15, 0xff, s23
-; GFX12-NEXT:    v_and_b32_e64 v17, 0xff, s21
-; GFX12-NEXT:    s_lshr_b32 s17, s5, 16
-; GFX12-NEXT:    v_lshrrev_b16 v8, 8, s4
-; GFX12-NEXT:    v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT:    v_lshrrev_b16 v3, 8, s2
-; GFX12-NEXT:    v_lshrrev_b16 v4, 8, s3
-; GFX12-NEXT:    v_lshrrev_b16 v2, 8, s1
-; GFX12-NEXT:    v_and_b32_e64 v19, 0xff, s17
-; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX12-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX12-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_and_b32 v7, 0xffff, v7
-; GFX12-NEXT:    v_lshrrev_b16 v0, 8, s0
-; GFX12-NEXT:    v_and_b32_e32 v20, 0xffff, v15
-; GFX12-NEXT:    v_and_b32_e32 v15, 0xffff, v17
-; GFX12-NEXT:    s_lshr_b32 s11, s6, 16
-; GFX12-NEXT:    s_lshr_b32 s13, s7, 16
-; GFX12-NEXT:    s_lshr_b32 s24, s1, 24
-; GFX12-NEXT:    s_lshr_b32 s15, s4, 16
-; GFX12-NEXT:    s_lshr_b32 s20, s3, 24
-; GFX12-NEXT:    s_lshr_b32 s19, s2, 16
-; GFX12-NEXT:    v_and_b32_e32 v17, 0xffff, v19
-; GFX12-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
-; GFX12-NEXT:    v_lshl_or_b32 v6, v4, 16, v10
-; GFX12-NEXT:    v_lshl_or_b32 v4, v3, 16, v11
-; GFX12-NEXT:    v_lshl_or_b32 v3, s24, 16, v14
-; GFX12-NEXT:    v_lshl_or_b32 v10, v9, 16, v12
-; GFX12-NEXT:    v_lshl_or_b32 v8, v8, 16, v13
-; GFX12-NEXT:    v_and_b32_e64 v9, 0xff, s7
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xff, s6
-; GFX12-NEXT:    v_and_b32_e64 v13, 0xff, s13
-; GFX12-NEXT:    v_and_b32_e64 v14, 0xff, s11
-; GFX12-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
-; GFX12-NEXT:    v_lshl_or_b32 v7, s20, 16, v15
-; GFX12-NEXT:    v_and_b32_e64 v15, 0xff, s15
-; GFX12-NEXT:    v_and_b32_e64 v18, 0xff, s19
-; GFX12-NEXT:    s_lshr_b32 s16, s5, 24
-; GFX12-NEXT:    v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT:    v_lshrrev_b16 v5, 8, s7
-; GFX12-NEXT:    v_lshl_or_b32 v11, s16, 16, v17
-; GFX12-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX12-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX12-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX12-NEXT:    v_and_b32_e32 v17, 0xffff, v14
-; GFX12-NEXT:    v_and_b32_e32 v19, 0xffff, v15
-; GFX12-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX12-NEXT:    s_lshr_b32 s10, s6, 24
-; GFX12-NEXT:    s_lshr_b32 s12, s7, 24
-; GFX12-NEXT:    s_lshr_b32 s14, s4, 24
-; GFX12-NEXT:    s_lshr_b32 s18, s2, 24
-; GFX12-NEXT:    v_lshl_or_b32 v14, v5, 16, v9
-; GFX12-NEXT:    v_lshl_or_b32 v12, v1, 16, v12
-; GFX12-NEXT:    v_lshl_or_b32 v15, s12, 16, v13
-; GFX12-NEXT:    v_lshl_or_b32 v13, s10, 16, v17
-; GFX12-NEXT:    s_lshr_b32 s22, s0, 24
-; GFX12-NEXT:    v_lshl_or_b32 v9, s14, 16, v19
-; GFX12-NEXT:    v_lshl_or_b32 v5, s18, 16, v18
-; GFX12-NEXT:    v_lshl_or_b32 v1, s22, 16, v20
+; GFX12-NEXT:    s_bfe_u32 s12, s7, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s13, s7, 24
+; GFX12-NEXT:    s_bfe_u32 s33, s7, 0x80010
+; GFX12-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX12-NEXT:    s_bfe_u32 s10, s6, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s11, s6, 24
+; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s7, s12
+; GFX12-NEXT:    s_and_b32 s12, s6, 0xff
+; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT:    s_bfe_u32 s14, s4, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s15, s4, 24
+; GFX12-NEXT:    s_bfe_u32 s16, s5, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s17, s5, 24
+; GFX12-NEXT:    s_bfe_u32 s30, s5, 0x80010
+; GFX12-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX12-NEXT:    s_bfe_u32 s31, s4, 0x80010
+; GFX12-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX12-NEXT:    s_bfe_u32 s18, s2, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s19, s2, 24
+; GFX12-NEXT:    s_bfe_u32 s20, s3, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s21, s3, 24
+; GFX12-NEXT:    s_bfe_u32 s28, s3, 0x80010
+; GFX12-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX12-NEXT:    s_bfe_u32 s29, s2, 0x80010
+; GFX12-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX12-NEXT:    s_pack_ll_b32_b16 s13, s33, s13
+; GFX12-NEXT:    s_pack_ll_b32_b16 s10, s12, s10
+; GFX12-NEXT:    s_pack_ll_b32_b16 s6, s6, s11
+; GFX12-NEXT:    s_bfe_u32 s22, s0, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s23, s0, 24
+; GFX12-NEXT:    s_bfe_u32 s24, s1, 0x80008
+; GFX12-NEXT:    s_lshr_b32 s25, s1, 24
+; GFX12-NEXT:    s_bfe_u32 s26, s1, 0x80010
+; GFX12-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX12-NEXT:    s_bfe_u32 s27, s0, 0x80010
+; GFX12-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX12-NEXT:    s_pack_ll_b32_b16 s17, s30, s17
+; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s5, s16
+; GFX12-NEXT:    s_pack_ll_b32_b16 s15, s31, s15
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s4, s14
+; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT:    s_pack_ll_b32_b16 s21, s28, s21
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s3, s20
+; GFX12-NEXT:    s_pack_ll_b32_b16 s19, s29, s19
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s2, s18
+; GFX12-NEXT:    v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s13
+; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s15
+; GFX12-NEXT:    s_pack_ll_b32_b16 s25, s26, s25
+; GFX12-NEXT:    s_pack_ll_b32_b16 s1, s1, s24
+; GFX12-NEXT:    s_pack_ll_b32_b16 s23, s27, s23
+; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s0, s22
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s17
+; GFX12-NEXT:    v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v9, s19
+; GFX12-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s21
+; GFX12-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s23
+; GFX12-NEXT:    v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s25
+; GFX12-NEXT:    v_mov_b32_e32 v14, s1
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[8:9]
+; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[8:9] offset:48
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
@@ -11855,114 +11856,137 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NOHSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
 ; GFX8-NOHSA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s18, s1, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v0, 8, s1
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s17, s1, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s18, 0xffff, s18
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s18, s1
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s19, s1, 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s1, s1, 16
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s18, s18, 8
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s17, s17, 0x80000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v2, s18, v0
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s0, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v0, 8, s0
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s17, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v1, 8, s17
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v3, s0, v1
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s16, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v1, 8, s16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v1, s0, v1
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s3, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v4, 8, s3
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v6, s0, v4
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s2, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v4, 8, s2
+; GFX8-NOHSA-NEXT:    s_and_b32 s18, s18, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s19, 0xffff, s19
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s17, 0xffff, s17
+; GFX8-NOHSA-NEXT:    s_or_b32 s18, s19, s18
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s19, s0
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s20, s0, 0x80000
+; GFX8-NOHSA-NEXT:    s_or_b32 s17, s17, s1
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s16, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s16, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s3
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s19, s19, 8
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s3, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s19, s19, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s20, 0xffff, s20
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s19, s20, s19
+; GFX8-NOHSA-NEXT:    s_or_b32 s20, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s2
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s2, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s15, s3, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v4, s0, v4
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s15, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v5, 8, s15
+; GFX8-NOHSA-NEXT:    s_or_b32 s21, s1, s0
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s3, 16
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s15, 0x80000
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s14, s2, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v7, s0, v5
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s14, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v5, 8, s14
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s3, s1, s0
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s2, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s14, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s2, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s5
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s5, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s14, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s4
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s4, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v5, s0, v5
+; GFX8-NOHSA-NEXT:    s_or_b32 s15, s1, s0
 ; GFX8-NOHSA-NEXT:    s_ashr_i64 s[0:1], s[4:5], 56
 ; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s13, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s4, 16
 ; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT:    s_or_b32 s2, s1, s0
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s5, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v8, 8, s5
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v10, s0, v8
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s4, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v8, 8, s4
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v8, s0, v8
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s12, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v9, 8, s12
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v9, s0, v9
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s7, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v11, 8, s7
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v13, s0, v11
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s6, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v11, 8, s6
-; GFX8-NOHSA-NEXT:    s_lshr_b32 s11, s7, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v11, s0, v11
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s11, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v12, 8, s11
+; GFX8-NOHSA-NEXT:    s_or_b32 s5, s1, s0
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s0, s4, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s12, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s4, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s0, s7
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s1, s7, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s0, s0, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NOHSA-NEXT:    s_lshr_b32 s10, s6, 16
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v14, s0, v12
-; GFX8-NOHSA-NEXT:    s_bfe_i32 s0, s10, 0x80000
-; GFX8-NOHSA-NEXT:    v_ashrrev_i16_e64 v12, 8, s10
-; GFX8-NOHSA-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX8-NOHSA-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX8-NOHSA-NEXT:    v_or_b32_e32 v12, s0, v12
+; GFX8-NOHSA-NEXT:    s_lshr_b32 s11, s7, 16
+; GFX8-NOHSA-NEXT:    s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT:    s_sext_i32_i16 s1, s6
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s7, s7, 16
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s12, s6, 0x80000
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s11, s11, 0x80000
+; GFX8-NOHSA-NEXT:    s_ashr_i32 s6, s6, 24
+; GFX8-NOHSA-NEXT:    s_bfe_i32 s10, s10, 0x80000
+; GFX8-NOHSA-NEXT:    s_and_b32 s1, s1, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s12, 0xffff, s12
+; GFX8-NOHSA-NEXT:    s_and_b32 s7, s7, 0xffff0000
+; GFX8-NOHSA-NEXT:    s_and_b32 s11, 0xffff, s11
+; GFX8-NOHSA-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX8-NOHSA-NEXT:    s_and_b32 s10, 0xffff, s10
+; GFX8-NOHSA-NEXT:    s_or_b32 s1, s12, s1
+; GFX8-NOHSA-NEXT:    s_or_b32 s7, s11, s7
+; GFX8-NOHSA-NEXT:    s_or_b32 s6, s10, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 48
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v16, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v15, s0
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 32
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s6
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v13, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v12, s0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NOHSA-NEXT:    s_add_u32 s0, s8, 16
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v11, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NOHSA-NEXT:    s_addc_u32 s1, s9, 0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NOHSA-NEXT:    s_nop 0
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v9, s1
-; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v8, s0
-; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT:    s_nop 0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v4, s8
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v0, s19
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v1, s16
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v3, s17
 ; GFX8-NOHSA-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NOHSA-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NOHSA-NEXT:    s_endpgm
@@ -12301,84 +12325,87 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX12-NEXT:    s_lshr_b32 s14, s2, 16
-; GFX12-NEXT:    v_ashrrev_i16 v4, 8, s2
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s20, s5, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v7, 8, s4
-; GFX12-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s17, s1, 16
-; GFX12-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX12-NEXT:    s_lshr_b32 s13, s5, 16
 ; GFX12-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX12-NEXT:    v_ashrrev_i16 v0, 8, s1
-; GFX12-NEXT:    s_bfe_i32 s18, s1, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v1, 8, s0
-; GFX12-NEXT:    s_bfe_i32 s19, s0, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v5, 8, s5
+; GFX12-NEXT:    s_lshr_b32 s17, s1, 16
+; GFX12-NEXT:    s_ashr_i32 s18, s1, 16
+; GFX12-NEXT:    s_bfe_i32 s19, s1, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s20, s1
+; GFX12-NEXT:    s_ashr_i32 s21, s0, 24
+; GFX12-NEXT:    s_bfe_i32 s22, s0, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s23, s0
 ; GFX12-NEXT:    s_ashr_i64 s[0:1], s[4:5], 56
-; GFX12-NEXT:    v_and_b32_e64 v10, 0xffff, s2
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xffff, s20
-; GFX12-NEXT:    s_bfe_i32 s1, s17, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v3, 8, s3
-; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX12-NEXT:    s_bfe_i32 s2, s15, 0x80000
-; GFX12-NEXT:    v_and_b32_e64 v14, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s1, s12, 0x80000
-; GFX12-NEXT:    v_and_b32_e64 v2, 0xffff, s18
-; GFX12-NEXT:    v_and_b32_e64 v6, 0xffff, s19
-; GFX12-NEXT:    v_and_b32_e64 v8, 0xffff, s3
-; GFX12-NEXT:    v_ashrrev_i16 v11, 8, s15
-; GFX12-NEXT:    v_and_b32_e64 v13, 0xffff, s4
-; GFX12-NEXT:    v_and_b32_e64 v15, 0xffff, s2
-; GFX12-NEXT:    v_lshl_or_b32 v4, v4, 16, v10
-; GFX12-NEXT:    v_lshl_or_b32 v10, v5, 16, v12
-; GFX12-NEXT:    v_and_b32_e64 v5, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s1, s7, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX12-NEXT:    s_bfe_i32 s1, s5, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-NEXT:    s_bfe_i32 s13, s13, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s5, s5, 8
+; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s13, s0
+; GFX12-NEXT:    s_ashr_i32 s13, s4, 24
+; GFX12-NEXT:    s_bfe_i32 s12, s12, 0x80000
+; GFX12-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s12, s13
+; GFX12-NEXT:    s_sext_i32_i16 s12, s4
+; GFX12-NEXT:    s_bfe_i32 s4, s4, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s12, s12, 8
+; GFX12-NEXT:    s_ashr_i32 s13, s7, 16
+; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX12-NEXT:    s_lshr_b32 s12, s13, 8
+; GFX12-NEXT:    s_sext_i32_i16 s13, s7
 ; GFX12-NEXT:    s_lshr_b32 s11, s7, 16
-; GFX12-NEXT:    v_and_b32_e64 v12, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s1, s6, 0x80000
+; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s13, s13, 8
 ; GFX12-NEXT:    s_lshr_b32 s10, s6, 16
-; GFX12-NEXT:    v_lshl_or_b32 v2, v0, 16, v2
-; GFX12-NEXT:    v_lshl_or_b32 v0, v1, 16, v6
-; GFX12-NEXT:    v_lshl_or_b32 v6, v3, 16, v8
-; GFX12-NEXT:    v_lshl_or_b32 v8, v7, 16, v13
-; GFX12-NEXT:    v_lshl_or_b32 v7, v11, 16, v15
-; GFX12-NEXT:    v_and_b32_e64 v15, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s1, s11, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX12-NEXT:    v_and_b32_e64 v22, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s1, s10, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v9, 8, s17
-; GFX12-NEXT:    s_bfe_i32 s3, s14, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v11, 8, s7
-; GFX12-NEXT:    v_ashrrev_i16 v13, 8, s6
-; GFX12-NEXT:    v_ashrrev_i16 v21, 8, s11
-; GFX12-NEXT:    v_ashrrev_i16 v23, 8, s10
-; GFX12-NEXT:    v_and_b32_e64 v24, 0xffff, s1
-; GFX12-NEXT:    s_bfe_i32 s5, s16, 0x80000
-; GFX12-NEXT:    v_ashrrev_i16 v1, 8, s12
-; GFX12-NEXT:    v_ashrrev_i16 v18, 8, s14
-; GFX12-NEXT:    s_bfe_i32 s4, s13, 0x80000
-; GFX12-NEXT:    v_and_b32_e64 v20, 0xffff, s3
-; GFX12-NEXT:    v_ashrrev_i16 v17, 8, s16
-; GFX12-NEXT:    v_and_b32_e64 v19, 0xffff, s5
-; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s4, s0
-; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    v_lshl_or_b32 v3, v9, 16, v14
-; GFX12-NEXT:    v_lshl_or_b32 v14, v11, 16, v12
-; GFX12-NEXT:    v_mov_b32_e32 v11, s0
-; GFX12-NEXT:    v_lshl_or_b32 v12, v13, 16, v15
-; GFX12-NEXT:    v_lshl_or_b32 v15, v21, 16, v22
-; GFX12-NEXT:    v_lshl_or_b32 v13, v23, 16, v24
-; GFX12-NEXT:    v_lshl_or_b32 v9, v1, 16, v5
-; GFX12-NEXT:    v_lshl_or_b32 v5, v18, 16, v20
-; GFX12-NEXT:    v_lshl_or_b32 v1, v17, 16, v19
+; GFX12-NEXT:    s_bfe_i32 s11, s11, 0x80000
+; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
+; GFX12-NEXT:    s_sext_i32_i16 s13, s6
+; GFX12-NEXT:    s_lshr_b32 s14, s2, 16
+; GFX12-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX12-NEXT:    s_ashr_i32 s24, s3, 16
+; GFX12-NEXT:    s_bfe_i32 s25, s3, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_ashr_i32 s26, s2, 24
+; GFX12-NEXT:    s_bfe_i32 s27, s2, 0x80000
+; GFX12-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX12-NEXT:    s_ashr_i32 s12, s6, 24
+; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s13, s13, 8
+; GFX12-NEXT:    s_bfe_i32 s10, s10, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s24, s24, 8
+; GFX12-NEXT:    s_bfe_i32 s15, s15, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-NEXT:    s_bfe_i32 s14, s14, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-NEXT:    s_pack_ll_b32_b16 s6, s6, s13
+; GFX12-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
+; GFX12-NEXT:    s_lshr_b32 s18, s18, 8
+; GFX12-NEXT:    s_bfe_i32 s17, s17, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s20, s20, 8
+; GFX12-NEXT:    s_bfe_i32 s16, s16, 0x80000
+; GFX12-NEXT:    s_lshr_b32 s23, s23, 8
+; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
+; GFX12-NEXT:    s_pack_ll_b32_b16 s15, s15, s24
+; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s25, s3
+; GFX12-NEXT:    s_pack_ll_b32_b16 s14, s14, s26
+; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s27, s2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT:    s_pack_ll_b32_b16 s17, s17, s18
+; GFX12-NEXT:    s_pack_ll_b32_b16 s18, s19, s20
+; GFX12-NEXT:    s_pack_ll_b32_b16 s16, s16, s21
+; GFX12-NEXT:    s_pack_ll_b32_b16 s19, s22, s23
+; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14
+; GFX12-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15
+; GFX12-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16
+; GFX12-NEXT:    v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17
+; GFX12-NEXT:    v_mov_b32_e32 v14, s18
 ; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[8:9]
+; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[8:9] offset:48
+; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
+; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
+; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index 6ed99f7074b641..add5f13bd2d996 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,6 +1,6 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
@@ -163,8 +163,7 @@ define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(ptr addrspace(1) %out,
 ; GCN-NOHSA: buffer_load_dword v
 ; GCN-HSA: flat_load_dword v
 
-; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
-; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
 
@@ -186,16 +185,12 @@ entry:
 ; GCN-NOHSA: buffer_load_dword v
 ; GCN-HSA: flat_load_dword v
 
-;FIXME: Need to optimize this sequence to avoid extra shift on VI.
-
 ; t23: i16 = truncate t18
 ; t49: i16 = srl t23, Constant:i32<8>
 ; t57: i32 = any_extend t49
 ; t58: i32 = sign_extend_inreg t57, ValueType:ch:i8
 
-; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
-; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
-; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
 
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index a2e55ce06b5252..97314910f82809 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
 ; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
@@ -161,12 +161,8 @@ define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(ptr addrspace(3) %out, p
 ;          t31: i32 = any_extend t23
 ;        t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
 
-; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
-; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-
-; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
-; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
 
 ; EG: LDS_USHORT_READ_RET
 ; EG-DAG: BFE_INT
@@ -182,8 +178,7 @@ define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(ptr addrspace(3) %out, p
 ; GFX9-NOT: m0
 ; GCN: ds_read_b32
 
-; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
-; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
+; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 0fb9e2572446b5..68b07bae032139 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 3
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
@@ -28,7 +28,7 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    ds_write_b8 v1, v2 offset:6
 ; GCN-NEXT:    ds_write_b16 v1, v3 offset:4
-; GCN-NEXT:    v_cmp_eq_u16_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_eq_u32_sdwa s[2:3], v3, v0 src0_sel:BYTE_0 src1_sel:DWORD
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; GCN-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index a4bde5c9d82153..c06a3dab329822 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s4, s2, 0xffff
-; VI-NEXT:    s_lshr_b32 s2, s2, 16
-; VI-NEXT:    s_lshr_b32 s5, s3, 16
-; VI-NEXT:    s_lshr_b32 s2, s2, s5
-; VI-NEXT:    s_lshr_b32 s3, s4, s3
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_lshr_b32 s4, s3, 16
+; VI-NEXT:    s_lshr_b32 s5, s2, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_lshr_b32 s4, s5, s4
+; VI-NEXT:    s_lshr_b32 s2, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s4, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 686797f290b97f..9f34e4e90f45ee 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -438,8 +438,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i8 s2, s2
-; VI-NEXT:    s_sext_i32_i8 s3, s3
+; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; VI-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; VI-NEXT:    s_sext_i32_i16 s3, s3
+; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -454,8 +456,10 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i8 s2, s2
-; GFX9-NEXT:    s_sext_i32_i8 s3, s3
+; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX9-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GFX9-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX9-NEXT:    s_min_i32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
@@ -464,14 +468,16 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX10-LABEL: s_test_imin_sle_i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x28
-; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x4c
+; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x4c
+; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x28
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i8 s2, s2
-; GFX10-NEXT:    s_sext_i32_i8 s3, s3
-; GFX10-NEXT:    s_min_i32 s2, s2, s3
+; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX10-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-NEXT:    s_sext_i32_i16 s3, s3
+; GFX10-NEXT:    s_min_i32 s2, s3, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -479,15 +485,17 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; GFX11-LABEL: s_test_imin_sle_i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x28
-; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x4c
+; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x4c
+; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i8 s2, s4
-; GFX11-NEXT:    s_sext_i32_i8 s3, s5
+; GFX11-NEXT:    s_bfe_i32 s2, s4, 0x80000
+; GFX11-NEXT:    s_bfe_i32 s3, s5, 0x80000
+; GFX11-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s2, s3, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -590,56 +598,81 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-LABEL: s_test_imin_sle_v4i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
-; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
-; VI-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
-; VI-NEXT:    s_ashr_i32 s4, s2, 24
-; VI-NEXT:    s_bfe_i32 s5, s2, 0x80010
-; VI-NEXT:    s_sext_i32_i8 s2, s2
-; VI-NEXT:    s_ashr_i32 s6, s3, 24
-; VI-NEXT:    s_bfe_i32 s7, s3, 0x80010
-; VI-NEXT:    s_sext_i32_i8 s3, s3
-; VI-NEXT:    s_min_i32 s4, s4, s6
+; VI-NEXT:    s_lshr_b32 s4, s2, 16
+; VI-NEXT:    s_bfe_i32 s4, s4, 0x80000
+; VI-NEXT:    s_lshr_b32 s7, s3, 16
+; VI-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; VI-NEXT:    s_ashr_i32 s5, s2, 24
+; VI-NEXT:    s_ashr_i32 s8, s3, 24
+; VI-NEXT:    s_sext_i32_i16 s7, s7
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_bfe_i32 s6, s2, 0x80000
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_bfe_i32 s9, s3, 0x80000
+; VI-NEXT:    s_sext_i32_i16 s3, s3
+; VI-NEXT:    s_min_i32 s5, s5, s8
+; VI-NEXT:    s_min_i32 s4, s4, s7
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_ashr_i32 s3, s3, 8
+; VI-NEXT:    s_ashr_i32 s2, s2, 8
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    s_min_i32 s2, s2, s3
-; VI-NEXT:    s_min_i32 s3, s5, s7
-; VI-NEXT:    v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s4
-; VI-NEXT:    v_mov_b32_e32 v2, s3
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_sext_i32_i16 s3, s9
+; VI-NEXT:    s_sext_i32_i16 s5, s6
+; VI-NEXT:    s_min_i32 s3, s5, s3
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_and_b32 s3, s3, 0xff
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s2, s2, 0xffff
+; VI-NEXT:    s_or_b32 s2, s2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_test_imin_sle_v4i8:
 ; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    s_bfe_i32 s6, s4, 0x80000
 ; GFX9-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX9-NEXT:    s_bfe_i32 s8, s7, 0x80000
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
+; GFX9-NEXT:    s_bfe_i32 s4, s4, 0x80000
+; GFX9-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 24
+; GFX9-NEXT:    s_ashr_i32 s8, s3, 24
+; GFX9-NEXT:    s_sext_i32_i16 s7, s7
+; GFX9-NEXT:    s_sext_i32_i16 s4, s4
+; GFX9-NEXT:    s_bfe_i32 s6, s2, 0x80000
+; GFX9-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX9-NEXT:    s_bfe_i32 s9, s3, 0x80000
-; GFX9-NEXT:    v_min_i16_sdwa v1, sext(s4), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NEXT:    v_min_i16_e32 v2, s6, v2
-; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x80000
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_min_i16_sdwa v2, sext(s2), sext(v2) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; GFX9-NEXT:    v_min_i16_e32 v3, s5, v3
-; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_sext_i32_i16 s3, s3
+; GFX9-NEXT:    s_min_i32 s5, s5, s8
+; GFX9-NEXT:    s_min_i32 s4, s4, s7
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX9-NEXT:    s_ashr_i32 s3, s3, 8
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 8
+; GFX9-NEXT:    s_or_b32 s4, s4, s5
+; GFX9-NEXT:    s_min_i32 s2, s2, s3
+; GFX9-NEXT:    s_sext_i32_i16 s3, s9
+; GFX9-NEXT:    s_sext_i32_i16 s5, s6
+; GFX9-NEXT:    s_min_i32 s3, s5, s3
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX9-NEXT:    s_or_b32 s2, s3, s2
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_or_b32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -649,66 +682,81 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; GFX10-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s3, 16
-; GFX10-NEXT:    v_ashrrev_i16 v0, 8, s2
-; GFX10-NEXT:    v_ashrrev_i16 v1, 8, s4
-; GFX10-NEXT:    v_ashrrev_i16 v2, 8, s5
-; GFX10-NEXT:    v_ashrrev_i16 v3, 8, s3
-; GFX10-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX10-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX10-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX10-NEXT:    s_ashr_i32 s5, s2, 24
+; GFX10-NEXT:    s_bfe_i32 s6, s2, 0x80000
+; GFX10-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-NEXT:    s_ashr_i32 s8, s3, 24
+; GFX10-NEXT:    s_bfe_i32 s9, s3, 0x80000
+; GFX10-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX10-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX10-NEXT:    v_min_i16 v1, v1, v2
-; GFX10-NEXT:    v_min_i16 v0, v0, v3
-; GFX10-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX10-NEXT:    v_min_i16 v2, s2, s3
-; GFX10-NEXT:    v_min_i16 v3, s4, s5
-; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX10-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX10-NEXT:    s_min_i32 s5, s5, s8
+; GFX10-NEXT:    s_ashr_i32 s3, s3, 8
+; GFX10-NEXT:    s_ashr_i32 s2, s2, 8
+; GFX10-NEXT:    s_sext_i32_i16 s8, s9
+; GFX10-NEXT:    s_sext_i32_i16 s6, s6
+; GFX10-NEXT:    s_sext_i32_i16 s7, s7
+; GFX10-NEXT:    s_sext_i32_i16 s4, s4
+; GFX10-NEXT:    s_min_i32 s2, s2, s3
+; GFX10-NEXT:    s_min_i32 s3, s6, s8
+; GFX10-NEXT:    s_min_i32 s4, s4, s7
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
+; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_lshl_b32 s3, s4, 16
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: s_test_imin_sle_v4i8:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x28
-; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x4c
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX11-NEXT:    v_ashrrev_i16 v0, 8, s0
-; GFX11-NEXT:    v_ashrrev_i16 v1, 8, s1
-; GFX11-NEXT:    v_ashrrev_i16 v2, 8, s4
-; GFX11-NEXT:    v_ashrrev_i16 v3, 8, s5
-; GFX11-NEXT:    s_bfe_i32 s0, s0, 0x80000
-; GFX11-NEXT:    s_bfe_i32 s1, s1, 0x80000
-; GFX11-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX11-NEXT:    s_bfe_i32 s5, s5, 0x80000
-; GFX11-NEXT:    v_min_i16 v4, s0, s1
-; GFX11-NEXT:    v_min_i16 v5, s4, s5
-; GFX11-NEXT:    v_min_i16 v2, v2, v3
-; GFX11-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b32 s4, s[2:3], 0x28
+; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x4c
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v0, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    s_lshr_b32 s2, s4, 16
+; GFX11-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX11-NEXT:    s_ashr_i32 s3, s4, 24
+; GFX11-NEXT:    s_bfe_i32 s6, s4, 0x80000
+; GFX11-NEXT:    s_sext_i32_i16 s4, s4
+; GFX11-NEXT:    s_ashr_i32 s8, s5, 24
+; GFX11-NEXT:    s_bfe_i32 s9, s5, 0x80000
+; GFX11-NEXT:    s_sext_i32_i16 s5, s5
+; GFX11-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX11-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX11-NEXT:    s_min_i32 s3, s3, s8
+; GFX11-NEXT:    s_ashr_i32 s5, s5, 8
+; GFX11-NEXT:    s_ashr_i32 s4, s4, 8
+; GFX11-NEXT:    s_sext_i32_i16 s8, s9
+; GFX11-NEXT:    s_sext_i32_i16 s6, s6
+; GFX11-NEXT:    s_sext_i32_i16 s7, s7
+; GFX11-NEXT:    s_sext_i32_i16 s2, s2
+; GFX11-NEXT:    s_min_i32 s4, s4, s5
+; GFX11-NEXT:    s_min_i32 s5, s6, s8
+; GFX11-NEXT:    s_min_i32 s2, s2, s7
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX11-NEXT:    s_or_b32 s2, s2, s3
+; GFX11-NEXT:    s_or_b32 s3, s5, s4
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -773,11 +821,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s4, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s5, s3, 16
+; VI-NEXT:    s_ashr_i32 s4, s3, 16
+; VI-NEXT:    s_ashr_i32 s5, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_min_i32 s4, s4, s5
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_min_i32 s4, s5, s4
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    s_lshl_b32 s3, s4, 16
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
@@ -937,24 +985,24 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s6, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
+; VI-NEXT:    s_ashr_i32 s6, s3, 16
+; VI-NEXT:    s_ashr_i32 s7, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s7, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_min_i32 s6, s6, s8
+; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_min_i32 s6, s7, s6
 ; VI-NEXT:    s_min_i32 s1, s1, s3
-; VI-NEXT:    s_min_i32 s7, s7, s9
-; VI-NEXT:    s_min_i32 s0, s0, s2
-; VI-NEXT:    s_lshl_b32 s2, s6, 16
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s2
-; VI-NEXT:    s_lshl_b32 s2, s7, 16
+; VI-NEXT:    s_or_b32 s1, s1, s6
+; VI-NEXT:    s_ashr_i32 s3, s2, 16
+; VI-NEXT:    s_ashr_i32 s6, s0, 16
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_min_i32 s3, s6, s3
+; VI-NEXT:    s_min_i32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_or_b32 s0, s0, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2645,19 +2693,22 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    flat_load_ushort v4, v[0:1]
-; VI-NEXT:    flat_load_ushort v5, v[2:3]
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    flat_load_ushort v4, v[2:3]
+; VI-NEXT:    flat_load_ushort v5, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v4, v5
-; VI-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v7, v6
+; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; VI-NEXT:    flat_store_short v[0:1], v4
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
@@ -2671,7 +2722,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    global_store_short v0, v1, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -2687,7 +2738,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[12:13]
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[14:15]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_cmp_lt_u32_sdwa vcc_lo, v1, v2 src0_sel:WORD_0 src1_sel:WORD_0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX10-NEXT:    global_store_short v0, v1, s[8:9]
@@ -2700,11 +2751,15 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v4, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
@@ -3158,38 +3213,38 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s2, s11, 16
-; VI-NEXT:    s_lshr_b32 s4, s10, 16
-; VI-NEXT:    s_and_b32 s5, s10, 0xffff
-; VI-NEXT:    s_lshr_b32 s10, s15, 16
+; VI-NEXT:    s_and_b32 s2, s15, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s11, 0xffff
-; VI-NEXT:    s_and_b32 s11, s15, 0xffff
-; VI-NEXT:    s_lshr_b32 s15, s14, 16
-; VI-NEXT:    s_min_u32 s2, s2, s10
+; VI-NEXT:    s_lshr_b32 s4, s15, 16
+; VI-NEXT:    s_lshr_b32 s5, s11, 16
+; VI-NEXT:    s_min_u32 s2, s3, s2
+; VI-NEXT:    s_min_u32 s3, s5, s4
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s3, s14, 0xffff
+; VI-NEXT:    s_and_b32 s4, s10, 0xffff
+; VI-NEXT:    s_min_u32 s3, s4, s3
+; VI-NEXT:    s_lshr_b32 s4, s14, 16
+; VI-NEXT:    s_lshr_b32 s5, s10, 16
+; VI-NEXT:    s_min_u32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_or_b32 s3, s3, s4
+; VI-NEXT:    s_and_b32 s4, s13, 0xffff
+; VI-NEXT:    s_and_b32 s5, s9, 0xffff
+; VI-NEXT:    s_min_u32 s4, s5, s4
+; VI-NEXT:    s_lshr_b32 s5, s13, 16
 ; VI-NEXT:    s_lshr_b32 s6, s9, 16
-; VI-NEXT:    s_and_b32 s7, s9, 0xffff
-; VI-NEXT:    s_lshr_b32 s9, s8, 16
-; VI-NEXT:    s_and_b32 s14, s14, 0xffff
-; VI-NEXT:    s_lshr_b32 s16, s13, 16
-; VI-NEXT:    s_lshr_b32 s17, s12, 16
-; VI-NEXT:    s_min_u32 s4, s4, s15
-; VI-NEXT:    s_min_u32 s3, s3, s11
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s8, s8, 0xffff
-; VI-NEXT:    s_and_b32 s13, s13, 0xffff
-; VI-NEXT:    s_and_b32 s12, s12, 0xffff
-; VI-NEXT:    s_min_u32 s9, s9, s17
-; VI-NEXT:    s_min_u32 s6, s6, s16
-; VI-NEXT:    s_min_u32 s5, s5, s14
-; VI-NEXT:    s_or_b32 s2, s3, s2
-; VI-NEXT:    s_lshl_b32 s3, s4, 16
-; VI-NEXT:    s_min_u32 s8, s8, s12
-; VI-NEXT:    s_min_u32 s7, s7, s13
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_lshl_b32 s4, s6, 16
-; VI-NEXT:    s_lshl_b32 s5, s9, 16
-; VI-NEXT:    s_or_b32 s4, s7, s4
-; VI-NEXT:    s_or_b32 s5, s8, s5
+; VI-NEXT:    s_min_u32 s5, s6, s5
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s5, s12, 0xffff
+; VI-NEXT:    s_and_b32 s6, s8, 0xffff
+; VI-NEXT:    s_min_u32 s5, s6, s5
+; VI-NEXT:    s_lshr_b32 s6, s12, 16
+; VI-NEXT:    s_lshr_b32 s7, s8, 16
+; VI-NEXT:    s_min_u32 s6, s7, s6
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_or_b32 s5, s5, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3520,9 +3575,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_sext_i32_i16 s3, s2
-; VI-NEXT:    s_ashr_i32 s2, s2, 16
-; VI-NEXT:    s_min_i32 s2, s3, s2
+; VI-NEXT:    s_ashr_i32 s3, s2, 16
+; VI-NEXT:    s_sext_i32_i16 s2, s2
+; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
@@ -3535,9 +3590,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_sext_i32_i16 s3, s2
-; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX9-NEXT:    s_min_i32 s2, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX9-NEXT:    s_sext_i32_i16 s2, s2
+; GFX9-NEXT:    s_min_i32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -3549,9 +3604,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_sext_i32_i16 s3, s2
-; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
-; GFX10-NEXT:    s_min_i32 s2, s3, s2
+; GFX10-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX10-NEXT:    s_sext_i32_i16 s2, s2
+; GFX10-NEXT:    s_min_i32 s2, s2, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -3563,10 +3618,10 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s4
-; GFX11-NEXT:    s_ashr_i32 s3, s4, 16
+; GFX11-NEXT:    s_ashr_i32 s2, s4, 16
+; GFX11-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_i32 s2, s2, s3
+; GFX11-NEXT:    s_min_i32 s2, s3, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 9b44b58c4a01e7..30a40e6af85389 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -1512,29 +1512,29 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ;
 ; VI-LABEL: s_mul_i1:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dword s4, s[2:3], 0x70
-; VI-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; VI-NEXT:    s_load_dword s5, s[2:3], 0x70
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mul_i32 s4, s4, s5
+; VI-NEXT:    s_and_b32 s4, s4, 1
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_mul_lo_u16_e32 v0, s5, v0
-; VI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_mul_i1:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x70
-; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x4c
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x4c
+; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x70
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_and_b32 s4, s4, 1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, s5, v0
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1545,10 +1545,11 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX10-NEXT:    s_load_dword s5, s[2:3], 0x70
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u16 v0, s4, s5
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_mul_i32 s4, s4, s5
+; GFX10-NEXT:    s_and_b32 s2, s4, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-NEXT:    s_mov_b32 s2, -1
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -1559,11 +1560,12 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX11-NEXT:    s_load_b32 s5, s[2:3], 0x70
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u16 v0, s4, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mul_i32 s4, s4, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s4, 1
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1576,11 +1578,12 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ; GFX12-NEXT:    s_load_b32 s5, s[2:3], 0x70
 ; GFX12-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mul_lo_u16 v0, s4, s5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT:    s_mul_i32 s4, s4, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_b32 s2, s4, 1
+; GFX12-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-NEXT:    s_mov_b32 s2, -1
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[0:3], null
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1655,7 +1658,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; VI-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1675,7 +1678,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX9-NEXT:    s_mov_b32 s0, s4
 ; GFX9-NEXT:    s_mov_b32 s1, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -1696,7 +1699,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX10-NEXT:    s_mov_b32 s0, s4
 ; GFX10-NEXT:    s_mov_b32 s1, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX10-NEXT:    s_endpgm
@@ -1717,7 +1720,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX11-NEXT:    s_mov_b32 s4, s0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[4:7], 0
@@ -1741,7 +1744,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GFX12-NEXT:    s_mov_b32 s4, s0
 ; GFX12-NEXT:    s_mov_b32 s5, s1
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX12-NEXT:    buffer_store_b8 v0, off, s[4:7], null
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 050300a69c46bb..57f5473749513f 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -611,50 +611,65 @@ define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_lshr_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshr_b32 s4, s9, 16
-; GFX10-NEXT:    v_lshlrev_b16 v0, 8, s9
-; GFX10-NEXT:    v_and_b32_e64 v1, 0xffffff00, s8
-; GFX10-NEXT:    v_lshlrev_b16 v2, 8, s4
-; GFX10-NEXT:    v_lshlrev_b16 v3, 8, s8
-; GFX10-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX10-NEXT:    v_or_b32_sdwa v0, s3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT:    s_bfe_u32 s3, s3, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s5, s9, 8
+; GFX10-NEXT:    s_bfe_u32 s9, s9, 0x100010
+; GFX10-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX10-NEXT:    s_lshl_b32 s6, s8, 8
+; GFX10-NEXT:    s_and_b32 s7, s8, 0xff00
+; GFX10-NEXT:    s_bfe_u32 s8, s2, 0x80010
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s3, s3, s5
+; GFX10-NEXT:    s_lshl_b32 s5, s9, 8
+; GFX10-NEXT:    s_or_b32 s4, s4, s6
+; GFX10-NEXT:    s_or_b32 s6, s8, s7
+; GFX10-NEXT:    s_or_b32 s2, s2, s5
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10-NEXT:    s_lshl_b32 s5, s6, 16
+; GFX10-NEXT:    s_or_b32 s3, s4, s3
+; GFX10-NEXT:    s_or_b32 s2, s2, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: shuffle8i8:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffffff00
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 8
-; GFX9-NEXT:    v_lshlrev_b16_e64 v1, 8, s9
-; GFX9-NEXT:    v_or_b32_sdwa v4, s3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_lshr_b32 s3, s9, 16
-; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
-; GFX9-NEXT:    v_lshlrev_b16_e64 v3, 8, s8
-; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_lshlrev_b16_e64 v1, 8, s3
-; GFX9-NEXT:    v_or_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s6, s8, 8
+; GFX9-NEXT:    s_lshl_b32 s5, s9, 8
+; GFX9-NEXT:    s_bfe_u32 s3, s3, 0x80008
+; GFX9-NEXT:    s_or_b32 s4, s4, s6
+; GFX9-NEXT:    s_bfe_u32 s6, s9, 0x100010
+; GFX9-NEXT:    s_and_b32 s7, s8, 0xff00
+; GFX9-NEXT:    s_or_b32 s3, s3, s5
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x80010
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s5, s5, s7
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX9-NEXT:    s_or_b32 s2, s2, s5
+; GFX9-NEXT:    s_or_b32 s3, s4, s3
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index b7113a65607fc5..2aaffd7121ae97 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -33,10 +33,9 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
 ; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
-; GFX940-NEXT:    s_mov_b32 s0, 0xffff
-; GFX940-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 0
-; GFX940-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -44,10 +43,9 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
 ; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
-; GFX90a-NEXT:    s_mov_b32 s0, 0xffff
-; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-NEXT:    s_and_b32 s0, s8, 0xff
 ; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90a-NEXT:    v_and_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-NEXT:    s_endpgm
   %ext = zext i8 %arg0 to i32
@@ -164,22 +162,18 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
 ; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-NEXT:    global_store_short v1, v0, s[2:3] sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX90a-LABEL: ptr1_v2i8_preload_arg:
 ; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-NEXT:    global_store_short v1, v0, s[6:7]
+; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX90a-NEXT:    s_endpgm
   store <2 x i8> %in, ptr addrspace(1) %out
   ret void
@@ -388,36 +382,36 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
 ; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_mov_b32_e32 v2, s5
-; GFX940-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-NEXT:    global_store_byte v1, v2, s[2:3] offset:4 sc0 sc1
-; GFX940-NEXT:    global_store_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX940-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX940-NEXT:    s_or_b32 s1, s4, s1
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX940-NEXT:    s_or_b32 s0, s0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NEXT:    global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX90a-LABEL: v5i8_preload_arg:
 ; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s9
-; GFX90a-NEXT:    global_store_byte v1, v2, s[6:7] offset:4
-; GFX90a-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-NEXT:    global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-NEXT:    s_endpgm
   store <5 x i8> %in, ptr addrspace(1) %out, align 4
   ret void
@@ -478,23 +472,22 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
 ; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
-; GFX940-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX940-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s5, 16
-; GFX940-NEXT:    v_or_b32_sdwa v0, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    s_nop 0
-; GFX940-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX940-NEXT:    s_lshr_b32 s1, s5, 24
+; GFX940-NEXT:    s_and_b32 s0, s5, 0xffff
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX940-NEXT:    s_bfe_u32 s5, s5, 0x80010
+; GFX940-NEXT:    s_or_b32 s1, s5, s1
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX940-NEXT:    s_lshr_b32 s5, s4, 24
+; GFX940-NEXT:    s_or_b32 s0, s0, s1
+; GFX940-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX940-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX940-NEXT:    s_or_b32 s4, s4, s5
+; GFX940-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX940-NEXT:    s_or_b32 s1, s1, s4
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX940-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
@@ -503,22 +496,22 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
 ; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s9, 8
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s9, 24
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s9, 16
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v2, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v2, s0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX90a-NEXT:    s_lshr_b32 s1, s9, 24
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-NEXT:    s_bfe_u32 s2, s9, 0x80010
+; GFX90a-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-NEXT:    s_lshr_b32 s2, s8, 24
+; GFX90a-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX90a-NEXT:    s_bfe_u32 s3, s8, 0x80010
+; GFX90a-NEXT:    s_and_b32 s0, s9, 0xffff
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-NEXT:    s_or_b32 s2, s3, s2
+; GFX90a-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-NEXT:    s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX90a-NEXT:    s_or_b32 s1, s1, s2
+; GFX90a-NEXT:    v_mov_b32_e32 v0, s1
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90a-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX90a-NEXT:    s_endpgm
@@ -782,44 +775,38 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ; GFX940:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-NEXT:    v_or_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX940-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX940-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0
-; GFX940-NEXT:    v_mov_b32_e32 v3, s5
-; GFX940-NEXT:    v_or_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    global_store_byte_d16_hi v2, v3, s[2:3] offset:6 sc0 sc1
-; GFX940-NEXT:    global_store_short v2, v1, s[2:3] offset:4 sc0 sc1
-; GFX940-NEXT:    global_store_dword v2, v0, s[2:3] sc0 sc1
+; GFX940-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX940-NEXT:    s_and_b32 s0, s4, 0xffff
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX940-NEXT:    s_bfe_u32 s4, s4, 0x80010
+; GFX940-NEXT:    s_or_b32 s1, s4, s1
+; GFX940-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX940-NEXT:    s_or_b32 s0, s0, s1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s5
+; GFX940-NEXT:    global_store_byte_d16_hi v0, v1, s[2:3] offset:6 sc0 sc1
+; GFX940-NEXT:    global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_dword v0, v1, s[2:3] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX90a-LABEL: v7i8_kernel_preload_arg:
 ; GFX90a:         s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 8
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_or_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    s_lshr_b32 s0, s9, 8
-; GFX90a-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v1, 8, s0
-; GFX90a-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v3, s9
-; GFX90a-NEXT:    v_or_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    global_store_byte_d16_hi v2, v3, s[6:7] offset:6
-; GFX90a-NEXT:    global_store_short v2, v1, s[6:7] offset:4
-; GFX90a-NEXT:    global_store_dword v2, v0, s[6:7]
+; GFX90a-NEXT:    s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX90a-NEXT:    s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT:    s_or_b32 s1, s2, s1
+; GFX90a-NEXT:    s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX90a-NEXT:    s_or_b32 s0, s0, s1
+; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s9
+; GFX90a-NEXT:    global_store_byte_d16_hi v0, v1, s[6:7] offset:6
+; GFX90a-NEXT:    global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-NEXT:    global_store_dword v0, v1, s[6:7]
 ; GFX90a-NEXT:    s_endpgm
   store <7 x i8> %in, ptr addrspace(1) %out
   ret void
@@ -948,13 +935,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
 ; GFX940-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX940-NEXT:  ; %bb.0:
 ; GFX940-NEXT:    s_lshr_b32 s0, s4, 24
-; GFX940-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX940-NEXT:    s_lshr_b32 s0, s4, 16
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0
-; GFX940-NEXT:    v_mov_b32_e32 v2, s4
-; GFX940-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX940-NEXT:    global_store_short v1, v2, s[2:3] sc0 sc1
-; GFX940-NEXT:    global_store_short v1, v0, s[6:7] sc0 sc1
+; GFX940-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX940-NEXT:    s_bfe_u32 s1, s4, 0x80010
+; GFX940-NEXT:    s_or_b32 s0, s1, s0
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mov_b32_e32 v1, s4
+; GFX940-NEXT:    global_store_short v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v1, s0
+; GFX940-NEXT:    global_store_short v0, v1, s[6:7] sc0 sc1
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
@@ -962,13 +950,14 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
 ; GFX90a-NEXT:    .fill 63, 4, 0xbf800000 ; s_nop 0
 ; GFX90a-NEXT:  ; %bb.0:
 ; GFX90a-NEXT:    s_lshr_b32 s0, s8, 24
-; GFX90a-NEXT:    v_lshlrev_b16_e64 v0, 8, s0
-; GFX90a-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90a-NEXT:    v_mov_b32_e32 v2, s8
-; GFX90a-NEXT:    v_or_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX90a-NEXT:    global_store_short v1, v2, s[6:7]
-; GFX90a-NEXT:    global_store_short v1, v0, s[10:11]
+; GFX90a-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX90a-NEXT:    s_bfe_u32 s1, s8, 0x80010
+; GFX90a-NEXT:    s_or_b32 s0, s1, s0
+; GFX90a-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s8
+; GFX90a-NEXT:    global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT:    v_mov_b32_e32 v1, s0
+; GFX90a-NEXT:    global_store_short v0, v1, s[10:11]
 ; GFX90a-NEXT:    s_endpgm
   store i16 %in, ptr addrspace(1) %out
   store <2 x i8> %in2, ptr addrspace(1) %out2
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 1700ce302cc9db..f299232918d99b 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -120,13 +120,14 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
-; VI-NEXT:    v_or_b32_e32 v2, v1, v0
-; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v2
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; VI-NEXT:    v_or_b32_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    s_lshl_b32 s1, s0, 8
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s0, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
@@ -160,13 +161,19 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v0
-; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    s_lshl_b32 s1, s0, 8
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_and_b32 s1, s0, 0xff00
+; VI-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; VI-NEXT:    s_or_b32 s1, s4, s1
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s1, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s1, s1, s4
+; VI-NEXT:    s_or_b32 s0, s0, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 64b3317edc5192..b1066e0f8f26ad 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -1787,15 +1787,14 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; NOSDWA-NEXT:    flat_load_dword v1, v[0:1]
 ; NOSDWA-NEXT:    flat_load_dword v2, v[2:3]
 ; NOSDWA-NEXT:    v_mov_b32_e32 v0, s4
-; NOSDWA-NEXT:    s_waitcnt vmcnt(1)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; NOSDWA-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; NOSDWA-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; NOSDWA-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; NOSDWA-NEXT:    v_add_u32_e32 v2, vcc, v3, v4
-; NOSDWA-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; NOSDWA-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; NOSDWA-NEXT:    v_or_b32_e32 v2, v1, v2
+; NOSDWA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; NOSDWA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; NOSDWA-NEXT:    v_or_b32_e32 v2, v3, v1
 ; NOSDWA-NEXT:    v_mov_b32_e32 v1, s5
 ; NOSDWA-NEXT:    flat_store_dword v[0:1], v2
 ; NOSDWA-NEXT:    s_endpgm
@@ -1813,9 +1812,9 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX89-NEXT:    flat_load_dword v2, v[2:3]
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_add_u32_sdwa v3, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX89-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX89-NEXT:    v_or_b32_sdwa v2, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT:    v_add_u32_e32 v3, vcc, v1, v2
+; GFX89-NEXT:    v_add_u32_sdwa v1, vcc, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX89-NEXT:    v_or_b32_sdwa v2, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX89-NEXT:    flat_store_dword v[0:1], v2
 ; GFX89-NEXT:    s_endpgm
@@ -1868,62 +1867,58 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; NOSDWA-NEXT:    v_mov_b32_e32 v0, s0
 ; NOSDWA-NEXT:    v_mov_b32_e32 v1, s1
 ; NOSDWA-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; NOSDWA-NEXT:    v_mov_b32_e32 v2, s2
 ; NOSDWA-NEXT:    v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT:    v_mov_b32_e32 v2, s2
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0)
-; NOSDWA-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; NOSDWA-NEXT:    v_and_b32_e32 v7, 0xff, v1
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; NOSDWA-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; NOSDWA-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
-; NOSDWA-NEXT:    v_lshlrev_b16_e32 v6, 8, v6
-; NOSDWA-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; NOSDWA-NEXT:    v_lshlrev_b16_e32 v8, 8, v8
-; NOSDWA-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
-; NOSDWA-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; NOSDWA-NEXT:    v_or_b32_e32 v4, v4, v5
-; NOSDWA-NEXT:    v_or_b32_e32 v0, v0, v6
-; NOSDWA-NEXT:    v_or_b32_e32 v5, v7, v8
-; NOSDWA-NEXT:    v_or_b32_e32 v1, v1, v9
-; NOSDWA-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; NOSDWA-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; NOSDWA-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; NOSDWA-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; NOSDWA-NEXT:    v_or_b32_e32 v0, v4, v0
-; NOSDWA-NEXT:    v_or_b32_e32 v1, v5, v1
+; NOSDWA-NEXT:    v_readfirstlane_b32 s0, v1
+; NOSDWA-NEXT:    v_readfirstlane_b32 s1, v0
+; NOSDWA-NEXT:    s_lshr_b32 s3, s1, 24
+; NOSDWA-NEXT:    s_lshr_b32 s5, s0, 24
+; NOSDWA-NEXT:    s_and_b32 s2, s1, 0xffff
+; NOSDWA-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; NOSDWA-NEXT:    s_and_b32 s4, s0, 0xffff
+; NOSDWA-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; NOSDWA-NEXT:    s_lshl_b32 s3, s3, 8
+; NOSDWA-NEXT:    s_lshl_b32 s5, s5, 8
+; NOSDWA-NEXT:    s_or_b32 s1, s1, s3
+; NOSDWA-NEXT:    s_or_b32 s0, s0, s5
+; NOSDWA-NEXT:    s_lshl_b32 s1, s1, 16
+; NOSDWA-NEXT:    s_lshl_b32 s0, s0, 16
+; NOSDWA-NEXT:    s_or_b32 s1, s2, s1
+; NOSDWA-NEXT:    s_or_b32 s0, s4, s0
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, s1
+; NOSDWA-NEXT:    v_mov_b32_e32 v1, s0
 ; NOSDWA-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; NOSDWA-NEXT:    s_endpgm
 ;
 ; GFX89-LABEL: pulled_out_test:
 ; GFX89:       ; %bb.0: ; %entry
 ; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX89-NEXT:    v_mov_b32_e32 v4, 8
-; GFX89-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX89-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX89-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX89-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX89-NEXT:    v_mov_b32_e32 v3, s3
+; GFX89-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
-; GFX89-NEXT:    v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX89-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX89-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX89-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT:    v_lshlrev_b16_e32 v6, 8, v7
-; GFX89-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX89-NEXT:    v_lshlrev_b16_e32 v4, 8, v9
-; GFX89-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX89-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX89-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX89-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX89-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX89-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX89-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX89-NEXT:    s_and_b32 s2, s1, 0xffff
+; GFX89-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX89-NEXT:    s_and_b32 s4, s0, 0xffff
+; GFX89-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX89-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX89-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX89-NEXT:    s_or_b32 s1, s1, s3
+; GFX89-NEXT:    s_or_b32 s0, s0, s5
+; GFX89-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX89-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX89-NEXT:    s_or_b32 s1, s2, s1
+; GFX89-NEXT:    s_or_b32 s0, s4, s0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s1
+; GFX89-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX89-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX89-NEXT:    s_endpgm
 ;
@@ -1931,25 +1926,27 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT:    v_and_b32_sdwa v6, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v5
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v7
-; GFX9-NEXT:    v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX9-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX9-NEXT:    s_and_b32 s2, s1, 0xffff
+; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX9-NEXT:    s_and_b32 s4, s0, 0xffff
+; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX9-NEXT:    s_or_b32 s1, s1, s3
+; GFX9-NEXT:    s_or_b32 s0, s0, s5
+; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX9-NEXT:    s_or_b32 s1, s2, s1
+; GFX9-NEXT:    s_or_b32 s0, s4, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1957,24 +1954,27 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 8
-; GFX10-NEXT:    v_mov_b32_e32 v4, 24
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v6, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX10-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX10-NEXT:    s_and_b32 s2, s0, 0xffff
+; GFX10-NEXT:    s_bfe_u32 s0, s0, 0x80010
+; GFX10-NEXT:    s_and_b32 s4, s1, 0xffff
+; GFX10-NEXT:    s_bfe_u32 s1, s1, 0x80010
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_or_b32 s0, s0, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s5
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
+; GFX10-NEXT:    s_or_b32 s1, s4, s1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX10-NEXT:    s_endpgm
 entry:
@@ -2101,16 +2101,16 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; NOSDWA:       ; %bb.0: ; %bb0
 ; NOSDWA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 0
-; NOSDWA-NEXT:    v_mov_b32_e32 v0, 0x100
 ; NOSDWA-NEXT:    s_and_b64 vcc, exec, -1
 ; NOSDWA-NEXT:  .LBB22_1: ; %bb1
 ; NOSDWA-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; NOSDWA-NEXT:    s_lshl_b32 s6, s4, 3
-; NOSDWA-NEXT:    v_mov_b32_e32 v1, s4
-; NOSDWA-NEXT:    v_mov_b32_e32 v2, s5
+; NOSDWA-NEXT:    v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT:    s_lshr_b32 s6, 0x100, s6
+; NOSDWA-NEXT:    v_mov_b32_e32 v1, s5
 ; NOSDWA-NEXT:    s_mov_b64 s[4:5], 1
-; NOSDWA-NEXT:    v_lshrrev_b16_e32 v3, s6, v0
-; NOSDWA-NEXT:    flat_store_byte v[1:2], v3
+; NOSDWA-NEXT:    v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT:    flat_store_byte v[0:1], v2
 ; NOSDWA-NEXT:    s_mov_b64 vcc, vcc
 ; NOSDWA-NEXT:    s_cbranch_vccnz .LBB22_1
 ; NOSDWA-NEXT:  ; %bb.2: ; %DummyReturnBlock
@@ -2121,16 +2121,16 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX89:       ; %bb.0: ; %bb0
 ; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 0
-; GFX89-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX89-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX89-NEXT:  .LBB22_1: ; %bb1
 ; GFX89-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX89-NEXT:    s_lshl_b32 s6, s4, 3
-; GFX89-NEXT:    v_mov_b32_e32 v1, s4
-; GFX89-NEXT:    v_mov_b32_e32 v2, s5
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    s_lshr_b32 s6, 0x100, s6
+; GFX89-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX89-NEXT:    s_mov_b64 s[4:5], 1
-; GFX89-NEXT:    v_lshrrev_b16_e32 v3, s6, v0
-; GFX89-NEXT:    flat_store_byte v[1:2], v3
+; GFX89-NEXT:    v_mov_b32_e32 v2, s6
+; GFX89-NEXT:    flat_store_byte v[0:1], v2
 ; GFX89-NEXT:    s_mov_b64 vcc, vcc
 ; GFX89-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX89-NEXT:  ; %bb.2: ; %DummyReturnBlock
@@ -2141,16 +2141,16 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX9:       ; %bb.0: ; %bb0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x100
 ; GFX9-NEXT:    s_and_b64 vcc, exec, -1
 ; GFX9-NEXT:  .LBB22_1: ; %bb1
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-NEXT:    s_lshl_b32 s6, s4, 3
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_lshr_b32 s6, 0x100, s6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 1
-; GFX9-NEXT:    v_lshrrev_b16_e32 v3, s6, v0
-; GFX9-NEXT:    flat_store_byte v[1:2], v3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    flat_store_byte v[0:1], v2
 ; GFX9-NEXT:    s_mov_b64 vcc, vcc
 ; GFX9-NEXT:    s_cbranch_vccnz .LBB22_1
 ; GFX9-NEXT:  ; %bb.2: ; %DummyReturnBlock
@@ -2167,7 +2167,8 @@ define void @crash_lshlrevb16_not_reg_op() {
 ; GFX10-NEXT:    s_lshl_b32 s6, s4, 3
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_lshrrev_b16 v2, s6, 0x100
+; GFX10-NEXT:    s_lshr_b32 s4, 0x100, s6
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX10-NEXT:    s_mov_b64 s[4:5], 1
 ; GFX10-NEXT:    flat_store_byte v[0:1], v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB22_1
diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
index eb7ceb82ff9e98..06a2d86c2755ee 100644
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -4,7 +4,7 @@
 ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN
 
 ; GCN-LABEL: {{^}}select_i1:
-; GCN: v_cndmask_b32
+; GCN: s_cselect_b32
 ; GCN-NOT: v_cndmask_b32
 define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
@@ -16,14 +16,9 @@ define amdgpu_kernel void @select_i1(ptr addrspace(1) %out, i32 %cond, i1 %a, i1
 ; GCN-LABEL: {{^}}s_minmax_i1:
 ; GCN: s_load_dword [[LOAD:s[0-9]+]],
 ; GCN: s_bitcmp1_b32 [[LOAD]], 0
-; GCN: s_cselect_b64 vcc, -1, 0
-; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
-; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
-
-; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
-; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
-; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
+; GCN: s_cselect_b32 [[SHIFTVAL:s[0-9]+]], 8, 16
+; GCN: s_lshr_b32 [[LOAD]], [[LOAD]], [[SHIFTVAL]]
+; GCN: s_and_b32  [[LOAD]], [[LOAD]], 1
 define amdgpu_kernel void @s_minmax_i1(ptr addrspace(1) %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
   %cmp = icmp slt i1 %cond, false
   %sel = select i1 %cmp, i1 %a, i1 %b
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index cca44548bb8f8b..c8c40d41dab720 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -14,7 +14,7 @@
 
 ; This is worse when i16 is legal and packed is not because
 ; SelectionDAGBuilder for some reason changes the select type.
-; VI: v_cndmask_b32
+; VI: s_cselect_b64
 ; VI: v_cndmask_b32
 define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
@@ -111,8 +111,7 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; SI: cndmask
 ; SI-NOT: cndmask
 
-; VI: s_cselect_b32
-; VI: s_cselect_b32
+; VI: s_cselect_b64
 ; GFX9: cndmask
 ; GFX9: cndmask
 define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
index 46aafed322cd8c..b477a72d3810ea 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -154,19 +154,12 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(ptr addrspace(1) %out, i32 %a,
 }
 
 ; FUNC-LABEL: {{^}}cmp_zext_k_i8max:
-; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
-; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
-; SI: s_cmpk_lg_i32 [[B]], 0xff
-; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0
-
-; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff
-; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff
-; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
-; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]]
+; GCN: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff
+; GCN: s_cmpk_lg_i32 [[B]], 0xff
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
 
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
-; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @cmp_zext_k_i8max(ptr addrspace(1) %out, i8 %b) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4e3dccb975fe8e..bc147ea3067c4f 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -521,13 +521,9 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o
 ; FUNC-LABEL: {{^}}s_sext_in_reg_i1_i16:
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 
-; SI: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
-; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
-; SI: buffer_store_short [[VBFE]]
-
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+; GCN: s_bfe_i32 [[BFE:s[0-9]+]], [[VAL]], 0x10000
+; GCN: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
+; GCN: buffer_store_short [[VBFE]]
 define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
   %ld = load i32, ptr addrspace(4) %ptr
   %in = trunc i32 %ld to i16
@@ -622,9 +618,7 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
 ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
 ; SI: buffer_store_short [[VSEXT]]
 
-; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_bfe_i32 [[VAL]], [[VAL]], 0x80000
 define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
   %shl = shl i16 %in, 8
   %sext = ashr i16 %shl, 8
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 47ab5ba666877a..c379c7a6826aca 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -317,11 +317,12 @@ define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_mov_b32 s1, s7
 ; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; VI-NEXT:    s_add_i32 s12, s12, 3
+; VI-NEXT:    s_and_b32 s0, s12, 0xffff
+; VI-NEXT:    s_add_i32 s0, s0, 3
 ; VI-NEXT:    s_mov_b32 s8, s4
 ; VI-NEXT:    s_mov_b32 s9, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
 ; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 9b9f03ff74aa3f..44dd0b6e27e740 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -27,9 +27,9 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_lshl_b32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_lshl_b32 s0, s1, s0
 ; VI-NEXT:    s_lshl_b32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index b54df3b4d0c6c6..5f1e3bd9a9fe1a 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -372,14 +372,14 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
 ; VI-NEXT:    s_ashr_i32 s5, s4, 24
 ; VI-NEXT:    s_bfe_i32 s6, s4, 0x80010
+; VI-NEXT:    s_bfe_i32 s7, s4, 0x80008
 ; VI-NEXT:    s_sext_i32_i8 s4, s4
-; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; VI-NEXT:    v_mov_b32_e32 v1, s4
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s6
@@ -447,19 +447,18 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
-; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
-; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
+; VI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
+; VI-NEXT:    v_bfe_i32 v2, v0, 16, 8
+; VI-NEXT:    v_bfe_i32 v3, v0, 8, 8
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
   %a = load i32, ptr addrspace(1) %in
   %cast = bitcast i32 %a to <4 x i8>
diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll
index e0d0ddce208c46..193b5b14d12889 100644
--- a/llvm/test/CodeGen/AMDGPU/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll
@@ -581,10 +581,24 @@ bb:
   ret void
 }
 
+; FIXME: Why the bfe on VI?
+
 ; GCN-LABEL: {{^}}s_test_smed3_i8_pat_0:
-; GCN: s_sext_i32_i8
-; GCN: s_sext_i32_i8
-; GCN: s_sext_i32_i8
+
+; VI: s_bfe_i32
+; VI: s_bfe_i32
+; VI: s_bfe_i32
+; VI: s_sext_i32_i16
+; VI: s_sext_i32_i16
+; VI: s_sext_i32_i16
+
+; SI: s_sext_i32_i8
+; SI: s_sext_i32_i8
+; SI: s_sext_i32_i8
+
+; GFX9: s_sext_i32_i16
+; GFX9: s_sext_i32_i16
+; GFX9: s_sext_i32_i16
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @s_test_smed3_i8_pat_0(ptr addrspace(1) %arg, [8 x i32], i8 %x, [8 x i32], i8 %y, [8 x i32], i8 %z) #1 {
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index fab001eddad5c6..4990e297b82100 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -25,17 +25,17 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s4, 16
-; VI-NEXT:    s_sub_i32 s6, 0, s4
-; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_sub_i32 s5, 0, s4
+; VI-NEXT:    s_lshr_b32 s6, s4, 16
+; VI-NEXT:    s_sub_i32 s6, 0, s6
 ; VI-NEXT:    s_ashr_i32 s7, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s5, s5
 ; VI-NEXT:    s_sext_i32_i16 s4, s4
 ; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_sext_i32_i16 s5, s5
-; VI-NEXT:    s_max_i32 s4, s4, s6
-; VI-NEXT:    s_max_i32 s5, s7, s5
+; VI-NEXT:    s_max_i32 s4, s4, s5
+; VI-NEXT:    s_max_i32 s6, s7, s6
 ; VI-NEXT:    s_add_i32 s4, s4, 2
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_lshl_b32 s5, s6, 16
 ; VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s4, s4, 0x20000
@@ -175,17 +175,17 @@ define amdgpu_kernel void @s_abs_v2i16_2(ptr addrspace(1) %out, <2 x i16> %val)
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s5, s4, 16
-; VI-NEXT:    s_sub_i32 s6, 0, s4
-; VI-NEXT:    s_sub_i32 s5, 0, s5
+; VI-NEXT:    s_sub_i32 s5, 0, s4
+; VI-NEXT:    s_lshr_b32 s6, s4, 16
+; VI-NEXT:    s_sub_i32 s6, 0, s6
 ; VI-NEXT:    s_ashr_i32 s7, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s5, s5
 ; VI-NEXT:    s_sext_i32_i16 s4, s4
 ; VI-NEXT:    s_sext_i32_i16 s6, s6
-; VI-NEXT:    s_sext_i32_i16 s5, s5
-; VI-NEXT:    s_max_i32 s4, s4, s6
-; VI-NEXT:    s_max_i32 s5, s7, s5
+; VI-NEXT:    s_max_i32 s4, s4, s5
+; VI-NEXT:    s_max_i32 s6, s7, s6
 ; VI-NEXT:    s_add_i32 s4, s4, 2
-; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_lshl_b32 s5, s6, 16
 ; VI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; VI-NEXT:    s_or_b32 s4, s5, s4
 ; VI-NEXT:    s_add_i32 s4, s4, 0x20000
@@ -339,38 +339,38 @@ define amdgpu_kernel void @s_abs_v4i16(ptr addrspace(1) %out, <4 x i16> %val) #0
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_sub_i32 s8, 0, s3
-; VI-NEXT:    s_sub_i32 s9, 0, s2
-; VI-NEXT:    s_sub_i32 s1, 0, s1
-; VI-NEXT:    s_sub_i32 s0, 0, s0
+; VI-NEXT:    s_sub_i32 s1, 0, s2
+; VI-NEXT:    s_lshr_b32 s9, s2, 16
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_sub_i32 s0, 0, s3
+; VI-NEXT:    s_lshr_b32 s8, s3, 16
+; VI-NEXT:    s_sub_i32 s9, 0, s9
 ; VI-NEXT:    s_ashr_i32 s10, s2, 16
-; VI-NEXT:    s_ashr_i32 s11, s3, 16
+; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_sext_i32_i16 s3, s3
+; VI-NEXT:    s_sub_i32 s8, 0, s8
 ; VI-NEXT:    s_sext_i32_i16 s9, s9
-; VI-NEXT:    s_sext_i32_i16 s8, s8
+; VI-NEXT:    s_max_i32 s1, s2, s1
 ; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_max_i32 s3, s3, s8
-; VI-NEXT:    s_max_i32 s2, s2, s9
-; VI-NEXT:    s_max_i32 s1, s11, s1
-; VI-NEXT:    s_max_i32 s0, s10, s0
-; VI-NEXT:    s_add_i32 s2, s2, 2
-; VI-NEXT:    s_add_i32 s3, s3, 2
-; VI-NEXT:    s_lshl_b32 s0, s0, 16
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
-; VI-NEXT:    s_and_b32 s3, s3, 0xffff
-; VI-NEXT:    s_and_b32 s2, s2, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    s_add_i32 s1, s1, 0x20000
+; VI-NEXT:    s_sext_i32_i16 s2, s3
+; VI-NEXT:    s_max_i32 s9, s10, s9
+; VI-NEXT:    s_ashr_i32 s10, s3, 16
+; VI-NEXT:    s_sext_i32_i16 s8, s8
+; VI-NEXT:    s_max_i32 s0, s2, s0
+; VI-NEXT:    s_max_i32 s8, s10, s8
+; VI-NEXT:    s_add_i32 s0, s0, 2
+; VI-NEXT:    s_lshl_b32 s2, s8, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_add_i32 s1, s1, 2
+; VI-NEXT:    s_or_b32 s0, s2, s0
+; VI-NEXT:    s_lshl_b32 s2, s9, 16
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_or_b32 s1, s2, s1
 ; VI-NEXT:    s_add_i32 s0, s0, 0x20000
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_add_i32 s1, s1, 0x20000
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, s0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -579,23 +579,23 @@ define amdgpu_kernel void @s_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s8, s4
 ; VI-NEXT:    s_mov_b32 s9, s5
-; VI-NEXT:    s_mov_b32 s0, s6
 ; VI-NEXT:    s_mov_b32 s1, s7
-; VI-NEXT:    s_ashr_i32 s4, s12, 16
-; VI-NEXT:    s_sext_i32_i16 s5, s12
-; VI-NEXT:    s_ashr_i32 s6, s13, 16
+; VI-NEXT:    s_ashr_i32 s4, s13, 16
+; VI-NEXT:    s_ashr_i32 s5, s12, 16
 ; VI-NEXT:    s_sext_i32_i16 s7, s13
-; VI-NEXT:    s_max_i32 s12, s4, s6
-; VI-NEXT:    s_max_i32 s13, s5, s7
-; VI-NEXT:    s_lshl_b32 s12, s12, 16
+; VI-NEXT:    s_sext_i32_i16 s12, s12
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_max_i32 s6, s5, s4
+; VI-NEXT:    s_max_i32 s13, s12, s7
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
 ; VI-NEXT:    s_and_b32 s13, s13, 0xffff
-; VI-NEXT:    s_min_i32 s4, s4, s6
-; VI-NEXT:    s_min_i32 s5, s5, s7
-; VI-NEXT:    s_or_b32 s12, s13, s12
+; VI-NEXT:    s_min_i32 s4, s5, s4
+; VI-NEXT:    s_min_i32 s5, s12, s7
+; VI-NEXT:    s_or_b32 s6, s13, s6
 ; VI-NEXT:    s_lshl_b32 s4, s4, 16
 ; VI-NEXT:    s_and_b32 s5, s5, 0xffff
 ; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s12
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -700,12 +700,12 @@ define amdgpu_kernel void @v_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_mov_b32 s4, s2
 ; VI-NEXT:    s_mov_b32 s5, s3
-; VI-NEXT:    v_max_i32_sdwa v2, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_max_i32_sdwa v3, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_min_i32_sdwa v4, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_i32_sdwa v2, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_max_i32_sdwa v3, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_min_i32_sdwa v4, sext(v0), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI-NEXT:    v_or_b32_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -794,38 +794,38 @@ define amdgpu_kernel void @s_min_max_v4i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_mov_b32 s3, 0xf000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_ashr_i32 s12, s9, 16
-; VI-NEXT:    s_ashr_i32 s13, s8, 16
+; VI-NEXT:    s_ashr_i32 s12, s11, 16
+; VI-NEXT:    s_ashr_i32 s13, s9, 16
+; VI-NEXT:    s_sext_i32_i16 s11, s11
 ; VI-NEXT:    s_sext_i32_i16 s9, s9
-; VI-NEXT:    s_sext_i32_i16 s8, s8
-; VI-NEXT:    s_ashr_i32 s14, s11, 16
+; VI-NEXT:    s_max_i32 s14, s13, s12
+; VI-NEXT:    s_max_i32 s15, s9, s11
+; VI-NEXT:    s_lshl_b32 s14, s14, 16
+; VI-NEXT:    s_and_b32 s15, s15, 0xffff
+; VI-NEXT:    s_or_b32 s14, s15, s14
 ; VI-NEXT:    s_ashr_i32 s15, s10, 16
-; VI-NEXT:    s_sext_i32_i16 s11, s11
+; VI-NEXT:    s_ashr_i32 s16, s8, 16
 ; VI-NEXT:    s_sext_i32_i16 s10, s10
-; VI-NEXT:    s_max_i32 s16, s13, s15
-; VI-NEXT:    s_max_i32 s17, s12, s14
+; VI-NEXT:    s_sext_i32_i16 s8, s8
+; VI-NEXT:    s_max_i32 s17, s16, s15
 ; VI-NEXT:    s_max_i32 s18, s8, s10
-; VI-NEXT:    s_max_i32 s19, s9, s11
-; VI-NEXT:    s_min_i32 s12, s12, s14
-; VI-NEXT:    s_min_i32 s9, s9, s11
 ; VI-NEXT:    s_lshl_b32 s17, s17, 16
-; VI-NEXT:    s_and_b32 s19, s19, 0xffff
-; VI-NEXT:    s_lshl_b32 s16, s16, 16
 ; VI-NEXT:    s_and_b32 s18, s18, 0xffff
-; VI-NEXT:    s_min_i32 s13, s13, s15
+; VI-NEXT:    s_min_i32 s12, s13, s12
+; VI-NEXT:    s_min_i32 s9, s9, s11
+; VI-NEXT:    s_min_i32 s11, s16, s15
 ; VI-NEXT:    s_min_i32 s8, s8, s10
-; VI-NEXT:    s_lshl_b32 s10, s12, 16
+; VI-NEXT:    s_or_b32 s17, s18, s17
+; VI-NEXT:    s_lshl_b32 s12, s12, 16
 ; VI-NEXT:    s_and_b32 s9, s9, 0xffff
-; VI-NEXT:    s_or_b32 s17, s19, s17
-; VI-NEXT:    s_or_b32 s16, s18, s16
-; VI-NEXT:    s_or_b32 s9, s9, s10
-; VI-NEXT:    s_lshl_b32 s10, s13, 16
+; VI-NEXT:    s_lshl_b32 s11, s11, 16
 ; VI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; VI-NEXT:    s_mov_b32 s0, s4
 ; VI-NEXT:    s_mov_b32 s1, s5
-; VI-NEXT:    v_mov_b32_e32 v0, s16
-; VI-NEXT:    v_mov_b32_e32 v1, s17
-; VI-NEXT:    s_or_b32 s8, s8, s10
+; VI-NEXT:    v_mov_b32_e32 v0, s17
+; VI-NEXT:    v_mov_b32_e32 v1, s14
+; VI-NEXT:    s_or_b32 s9, s9, s12
+; VI-NEXT:    s_or_b32 s8, s8, s11
 ; VI-NEXT:    s_mov_b32 s4, s6
 ; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s2
@@ -924,26 +924,26 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; GFX9-NEXT:    s_mov_b32 s12, s10
 ; GFX9-NEXT:    s_mov_b32 s13, s11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_cmp_gt_i32_sdwa vcc, sext(v0), sext(v1) src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_cmp_gt_i32_sdwa s[0:1], sext(v0), sext(v1) src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    v_cmp_gt_i16_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT:    v_cmp_gt_i16_e64 s[0:1], v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v3, v2, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_lshl_or_b32 v4, v5, 16, v4
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_lshl_or_b32 v3, v5, 16, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 1, v3
-; GFX9-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; GFX9-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX9-NEXT:    buffer_store_dword v3, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[12:15], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v0, v1, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v1
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
@@ -951,59 +951,51 @@ define amdgpu_kernel void @v_min_max_v2i16_user(ptr addrspace(1) %out0, ptr addr
 ; VI-LABEL: v_min_max_v2i16_user:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s12
-; VI-NEXT:    s_mov_b32 s5, s13
+; VI-NEXT:    s_mov_b32 s0, s12
+; VI-NEXT:    s_mov_b32 s1, s13
 ; VI-NEXT:    s_mov_b32 s12, s14
 ; VI-NEXT:    s_mov_b32 s13, s15
-; VI-NEXT:    s_mov_b32 s14, s2
-; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0 glc
+; VI-NEXT:    s_mov_b32 s14, s6
+; VI-NEXT:    s_mov_b32 s15, s7
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_mov_b32 s0, s8
-; VI-NEXT:    s_mov_b32 s1, s9
-; VI-NEXT:    s_mov_b32 s5, s11
-; VI-NEXT:    s_mov_b32 s4, s10
-; VI-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-NEXT:    v_readfirstlane_b32 s9, v1
-; VI-NEXT:    s_ashr_i32 s11, s8, 16
-; VI-NEXT:    s_ashr_i32 s13, s9, 16
-; VI-NEXT:    s_cmp_gt_i32 s11, s13
-; VI-NEXT:    s_sext_i32_i16 s10, s8
-; VI-NEXT:    s_sext_i32_i16 s12, s9
-; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
-; VI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
-; VI-NEXT:    s_cselect_b32 s8, s11, s13
-; VI-NEXT:    s_cselect_b32 s11, s13, s11
-; VI-NEXT:    s_lshl_b32 s13, s8, 16
-; VI-NEXT:    s_cmp_gt_i32 s10, s12
-; VI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[8:9]
-; VI-NEXT:    s_and_b64 s[8:9], s[8:9], exec
-; VI-NEXT:    s_cselect_b32 s8, s10, s12
-; VI-NEXT:    s_cselect_b32 s9, s12, s10
-; VI-NEXT:    s_and_b32 s8, s8, 0xffff
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
-; VI-NEXT:    s_lshl_b32 s10, s11, 16
-; VI-NEXT:    s_and_b32 s9, s9, 0xffff
-; VI-NEXT:    s_or_b32 s8, s8, s13
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    s_or_b32 s9, s9, s10
-; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    v_and_b32_e32 v0, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v2, s9
-; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s8
+; VI-NEXT:    s_mov_b32 s5, s9
+; VI-NEXT:    s_mov_b32 s12, s10
+; VI-NEXT:    s_mov_b32 s13, s11
+; VI-NEXT:    v_bfe_i32 v2, v0, 0, 16
+; VI-NEXT:    v_bfe_i32 v3, v1, 0, 16
+; VI-NEXT:    v_ashrrev_i32_e32 v6, 16, v0
+; VI-NEXT:    v_ashrrev_i32_e32 v7, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, v6, v7
+; VI-NEXT:    v_cmp_gt_i32_e64 s[0:1], v2, v3
+; VI-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v1, v5, v1
 ; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 3, v1
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_endpgm
 ;
@@ -1099,23 +1091,23 @@ define amdgpu_kernel void @u_min_max_v2i16(ptr addrspace(1) %out0, ptr addrspace
 ; VI-NEXT:    s_mov_b32 s10, -1
 ; VI-NEXT:    s_mov_b32 s2, s10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s4
-; VI-NEXT:    s_mov_b32 s0, s6
-; VI-NEXT:    s_lshr_b32 s4, s12, 16
-; VI-NEXT:    s_lshr_b32 s6, s13, 16
 ; VI-NEXT:    s_mov_b32 s9, s5
 ; VI-NEXT:    s_mov_b32 s1, s7
 ; VI-NEXT:    s_and_b32 s5, s12, 0xffff
-; VI-NEXT:    s_and_b32 s7, s13, 0xffff
-; VI-NEXT:    s_max_u32 s13, s4, s6
-; VI-NEXT:    s_max_u32 s12, s5, s7
+; VI-NEXT:    s_lshr_b32 s7, s13, 16
+; VI-NEXT:    s_lshr_b32 s12, s12, 16
+; VI-NEXT:    s_mov_b32 s8, s4
+; VI-NEXT:    s_and_b32 s4, s13, 0xffff
+; VI-NEXT:    s_max_u32 s13, s12, s7
+; VI-NEXT:    s_mov_b32 s0, s6
+; VI-NEXT:    s_max_u32 s6, s5, s4
 ; VI-NEXT:    s_lshl_b32 s13, s13, 16
-; VI-NEXT:    s_min_u32 s4, s4, s6
-; VI-NEXT:    s_or_b32 s12, s12, s13
-; VI-NEXT:    s_min_u32 s5, s5, s7
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s12
+; VI-NEXT:    s_min_u32 s4, s5, s4
+; VI-NEXT:    s_min_u32 s5, s12, s7
+; VI-NEXT:    s_or_b32 s6, s6, s13
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_mov_b32 s3, s11
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index cd06a060a50cd8..c9157631e0dfc2 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -187,15 +187,15 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v1
-; VI-NEXT:    v_readfirstlane_b32 s1, v0
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    v_readfirstlane_b32 s1, v1
 ; VI-NEXT:    s_ashr_i32 s2, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    s_ashr_i32 s3, s0, 16
+; VI-NEXT:    s_sext_i32_i16 s1, s1
 ; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s1, s0
-; VI-NEXT:    s_ashr_i32 s1, s2, s3
-; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_ashr_i32 s2, s3, s2
+; VI-NEXT:    s_ashr_i32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s2, 16
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ;
 ; VI-LABEL: ashr_v4i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
-; VI-NEXT:    s_mov_b32 s7, 0xf000
-; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_mov_b32 s10, s6
-; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s8, s2
-; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    s_mov_b32 s8, s6
+; VI-NEXT:    s_mov_b32 s9, s7
 ; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_readfirstlane_b32 s0, v2
-; VI-NEXT:    v_readfirstlane_b32 s1, v3
-; VI-NEXT:    v_readfirstlane_b32 s2, v0
-; VI-NEXT:    v_readfirstlane_b32 s3, v1
-; VI-NEXT:    s_ashr_i32 s8, s3, 16
-; VI-NEXT:    s_sext_i32_i16 s3, s3
-; VI-NEXT:    s_ashr_i32 s9, s2, 16
-; VI-NEXT:    s_sext_i32_i16 s2, s2
-; VI-NEXT:    s_ashr_i32 s10, s1, 16
-; VI-NEXT:    s_sext_i32_i16 s1, s1
-; VI-NEXT:    s_ashr_i32 s11, s0, 16
-; VI-NEXT:    s_sext_i32_i16 s0, s0
-; VI-NEXT:    s_ashr_i32 s0, s2, s0
-; VI-NEXT:    s_ashr_i32 s2, s9, s11
-; VI-NEXT:    s_ashr_i32 s1, s3, s1
-; VI-NEXT:    s_ashr_i32 s3, s8, s10
-; VI-NEXT:    s_lshl_b32 s3, s3, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0xffff
-; VI-NEXT:    s_lshl_b32 s2, s2, 16
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
-; VI-NEXT:    s_or_b32 s1, s1, s3
-; VI-NEXT:    s_or_b32 s0, s0, s2
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT:    v_readfirstlane_b32 s4, v0
+; VI-NEXT:    v_readfirstlane_b32 s5, v2
+; VI-NEXT:    v_readfirstlane_b32 s6, v1
+; VI-NEXT:    v_readfirstlane_b32 s7, v3
+; VI-NEXT:    s_ashr_i32 s8, s7, 16
+; VI-NEXT:    s_ashr_i32 s9, s6, 16
+; VI-NEXT:    s_sext_i32_i16 s7, s7
+; VI-NEXT:    s_sext_i32_i16 s6, s6
+; VI-NEXT:    s_ashr_i32 s10, s5, 16
+; VI-NEXT:    s_ashr_i32 s11, s4, 16
+; VI-NEXT:    s_sext_i32_i16 s5, s5
+; VI-NEXT:    s_sext_i32_i16 s4, s4
+; VI-NEXT:    s_ashr_i32 s8, s9, s8
+; VI-NEXT:    s_ashr_i32 s6, s6, s7
+; VI-NEXT:    s_ashr_i32 s7, s11, s10
+; VI-NEXT:    s_ashr_i32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s5, s8, 16
+; VI-NEXT:    s_and_b32 s6, s6, 0xffff
+; VI-NEXT:    s_lshl_b32 s7, s7, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s5, s6, s5
+; VI-NEXT:    s_or_b32 s4, s4, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; EG-LABEL: ashr_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 7b0241984a3491..5b7d4d1d724cf3 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -9,17 +9,18 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_movk_i32 s0, 0x4925
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
-; GCN-NEXT:    v_ashrrev_i32_e32 v2, 17, v2
-; GCN-NEXT:    v_add_u16_e32 v2, v2, v3
-; GCN-NEXT:    v_mul_lo_u16_e32 v2, 7, v2
-; GCN-NEXT:    v_sub_u16_e32 v1, v1, v2
+; GCN-NEXT:    v_readfirstlane_b32 s0, v1
+; GCN-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NEXT:    s_mulk_i32 s0, 0x4925
+; GCN-NEXT:    s_lshr_b32 s1, s0, 31
+; GCN-NEXT:    s_ashr_i32 s0, s0, 17
+; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
+; GCN-NEXT:    s_add_i32 s0, s0, s1
+; GCN-NEXT:    s_mul_i32 s0, s0, 7
+; GCN-NEXT:    v_subrev_u32_e32 v1, s0, v1
 ; GCN-NEXT:    global_store_short v0, v1, s[4:5]
 ; GCN-NEXT:    s_endpgm
 ;
@@ -54,17 +55,18 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s2
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s3
 ; TONGA-NEXT:    flat_load_ushort v2, v[0:1]
-; TONGA-NEXT:    s_movk_i32 s2, 0x4925
+; TONGA-NEXT:    v_mov_b32_e32 v0, s0
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_bfe_i32 v0, v2, 0, 16
-; TONGA-NEXT:    v_mul_lo_u32 v3, v0, s2
-; TONGA-NEXT:    v_mov_b32_e32 v0, s0
-; TONGA-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
-; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 17, v3
-; TONGA-NEXT:    v_add_u16_e32 v3, v3, v4
-; TONGA-NEXT:    v_mul_lo_u16_e32 v3, 7, v3
-; TONGA-NEXT:    v_sub_u16_e32 v2, v2, v3
+; TONGA-NEXT:    v_readfirstlane_b32 s0, v2
+; TONGA-NEXT:    s_sext_i32_i16 s0, s0
+; TONGA-NEXT:    s_mulk_i32 s0, 0x4925
+; TONGA-NEXT:    s_lshr_b32 s1, s0, 31
+; TONGA-NEXT:    s_ashr_i32 s0, s0, 17
+; TONGA-NEXT:    s_and_b32 s0, s0, 0xffff
+; TONGA-NEXT:    s_add_i32 s0, s0, s1
+; TONGA-NEXT:    s_mul_i32 s0, s0, 7
+; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, s0, v2
 ; TONGA-NEXT:    flat_store_short v[0:1], v2
 ; TONGA-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 5a821db6ff0408..327a85e80da9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -117,23 +117,21 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
-; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshr_b32 s4, s6, 16
-; VI-NEXT:    s_lshr_b32 s5, s7, 16
-; VI-NEXT:    s_sub_i32 s6, s6, s7
-; VI-NEXT:    s_sub_i32 s4, s4, s5
-; VI-NEXT:    s_and_b32 s5, s6, 0xffff
-; VI-NEXT:    s_lshl_b32 s4, s4, 16
-; VI-NEXT:    s_or_b32 s4, s5, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_sub_i32 s1, s2, s0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    s_lshr_b32 s2, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s2, s0
+; VI-NEXT:    s_and_b32 s1, s1, 0xffff
+; VI-NEXT:    s_lshl_b32 s0, s0, 16
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: s_test_sub_v2i16:
@@ -235,9 +233,9 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshr_b32 s0, s2, 16
-; VI-NEXT:    s_lshr_b32 s1, s3, 16
-; VI-NEXT:    s_sub_i32 s0, s0, s1
+; VI-NEXT:    s_lshr_b32 s0, s3, 16
+; VI-NEXT:    s_lshr_b32 s1, s2, 16
+; VI-NEXT:    s_sub_i32 s0, s1, s0
 ; VI-NEXT:    s_sub_i32 s1, s2, s3
 ; VI-NEXT:    s_lshl_b32 s0, s0, 16
 ; VI-NEXT:    s_and_b32 s1, s1, 0xffff
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index 22eb7dddb84f4d..80b0bdd8c03759 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -41,7 +41,7 @@ define i16 @trunc_bitcast_v2i32_to_i16(<2 x i32> %bar) {
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v0, 4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load i32, ptr addrspace(1) undef
   %load1 = load i32, ptr addrspace(1) null
@@ -70,7 +70,7 @@ define i16 @trunc_bitcast_v2f32_to_i16(<2 x float> %bar) {
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v0, 4, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load float, ptr addrspace(1) undef
   %load1 = load float, ptr addrspace(1) null
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index efb1a630f927ca..088aff983ddc9c 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -58,35 +58,47 @@ define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx16 s[4:19], s[2:3], 0x64
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s17
-; VI-NEXT:    v_mov_b32_e32 v1, s16
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 8, s19
-; VI-NEXT:    v_mov_b32_e32 v3, s18
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s13
-; VI-NEXT:    v_mov_b32_e32 v1, s12
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s15
-; VI-NEXT:    v_mov_b32_e32 v2, s14
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s9
-; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s11
-; VI-NEXT:    v_mov_b32_e32 v4, s10
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s5
-; VI-NEXT:    v_mov_b32_e32 v4, s4
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s7
-; VI-NEXT:    v_mov_b32_e32 v5, s6
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_lshl_b32 s2, s19, 8
+; VI-NEXT:    s_and_b32 s3, s18, 0xff
+; VI-NEXT:    s_lshl_b32 s17, s17, 8
+; VI-NEXT:    s_and_b32 s16, s16, 0xff
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_or_b32 s3, s16, s17
+; VI-NEXT:    s_lshl_b32 s2, s2, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_lshl_b32 s3, s15, 8
+; VI-NEXT:    s_and_b32 s14, s14, 0xff
+; VI-NEXT:    s_lshl_b32 s13, s13, 8
+; VI-NEXT:    s_and_b32 s12, s12, 0xff
+; VI-NEXT:    s_lshl_b32 s11, s11, 8
+; VI-NEXT:    s_and_b32 s10, s10, 0xff
+; VI-NEXT:    s_lshl_b32 s9, s9, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
+; VI-NEXT:    s_lshl_b32 s7, s7, 8
+; VI-NEXT:    s_and_b32 s6, s6, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s5, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_or_b32 s3, s14, s3
+; VI-NEXT:    s_or_b32 s12, s12, s13
+; VI-NEXT:    s_or_b32 s10, s10, s11
+; VI-NEXT:    s_or_b32 s8, s8, s9
+; VI-NEXT:    s_or_b32 s6, s6, s7
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s12, s12, 0xffff
+; VI-NEXT:    s_lshl_b32 s10, s10, 16
+; VI-NEXT:    s_and_b32 s8, s8, 0xffff
+; VI-NEXT:    s_lshl_b32 s6, s6, 16
+; VI-NEXT:    s_and_b32 s4, s4, 0xffff
+; VI-NEXT:    s_or_b32 s3, s12, s3
+; VI-NEXT:    s_or_b32 s8, s8, s10
+; VI-NEXT:    s_or_b32 s4, s4, s6
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s8
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
@@ -153,35 +165,47 @@ define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x24
 ; VI-NEXT:    s_load_dwordx16 s[0:15], s[2:3], 0xa4
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s26
-; VI-NEXT:    v_mov_b32_e32 v1, s24
-; VI-NEXT:    v_lshlrev_b16_e64 v2, 8, s30
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v1, s28
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s18
-; VI-NEXT:    v_mov_b32_e32 v1, s16
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s22
-; VI-NEXT:    v_mov_b32_e32 v2, s20
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s10
-; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v1, 8, s14
-; VI-NEXT:    v_mov_b32_e32 v4, s12
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v0, 8, s2
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e64 v4, 8, s6
-; VI-NEXT:    v_mov_b32_e32 v5, s4
-; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_lshl_b32 s1, s30, 8
+; VI-NEXT:    s_and_b32 s3, s28, 0xff
+; VI-NEXT:    s_lshl_b32 s5, s26, 8
+; VI-NEXT:    s_or_b32 s1, s3, s1
+; VI-NEXT:    s_and_b32 s3, s24, 0xff
+; VI-NEXT:    s_or_b32 s3, s3, s5
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s3, s3, 0xffff
+; VI-NEXT:    s_or_b32 s1, s3, s1
+; VI-NEXT:    s_lshl_b32 s3, s22, 8
+; VI-NEXT:    s_and_b32 s5, s20, 0xff
+; VI-NEXT:    s_or_b32 s3, s5, s3
+; VI-NEXT:    s_lshl_b32 s5, s18, 8
+; VI-NEXT:    s_and_b32 s7, s16, 0xff
+; VI-NEXT:    s_or_b32 s5, s7, s5
+; VI-NEXT:    s_lshl_b32 s3, s3, 16
+; VI-NEXT:    s_and_b32 s5, s5, 0xffff
+; VI-NEXT:    s_or_b32 s3, s5, s3
+; VI-NEXT:    s_lshl_b32 s5, s14, 8
+; VI-NEXT:    s_and_b32 s7, s12, 0xff
+; VI-NEXT:    s_or_b32 s5, s7, s5
+; VI-NEXT:    s_lshl_b32 s7, s10, 8
+; VI-NEXT:    s_and_b32 s8, s8, 0xff
+; VI-NEXT:    s_lshl_b32 s6, s6, 8
+; VI-NEXT:    s_and_b32 s4, s4, 0xff
+; VI-NEXT:    s_lshl_b32 s2, s2, 8
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_or_b32 s7, s8, s7
+; VI-NEXT:    s_or_b32 s4, s4, s6
+; VI-NEXT:    s_or_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s5, s5, 16
+; VI-NEXT:    s_and_b32 s7, s7, 0xffff
+; VI-NEXT:    s_lshl_b32 s4, s4, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s5, s7, s5
+; VI-NEXT:    s_or_b32 s0, s0, s4
 ; VI-NEXT:    v_mov_b32_e32 v4, s34
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_mov_b32_e32 v5, s35
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index c08571a733cc51..7a49e3f5b210f7 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -491,16 +491,19 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ushort v4, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ushort v5, v[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v5, v4, v5
-; VI-NEXT:    v_cmp_lt_u16_e32 vcc, v5, v4
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v5
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
+; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v6, v4
 ; VI-NEXT:    flat_store_short v[0:1], v5
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
@@ -514,9 +517,9 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v2, v1, v2
-; GFX9-NEXT:    v_cmp_lt_u16_e32 vcc, v2, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_add_u32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 36a0cbd3f09703..82893c47bc7460 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -491,16 +491,19 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ushort v4, v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v3, s7
 ; VI-NEXT:    flat_load_ushort v5, v[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_sub_u16_e32 v5, v4, v5
-; VI-NEXT:    v_cmp_gt_u16_e32 vcc, v5, v4
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v4, v5
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
+; VI-NEXT:    v_cmp_gt_u32_e32 vcc, v6, v4
 ; VI-NEXT:    flat_store_short v[0:1], v5
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    flat_store_byte v[2:3], v0
@@ -514,9 +517,9 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[8:9]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_sub_u16_e32 v2, v1, v2
-; GFX9-NEXT:    v_cmp_gt_u16_e32 vcc, v2, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_sub_u32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cmp_gt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:    global_store_short v0, v2, s[4:5]
 ; GFX9-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX9-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index ee99fcc5863340..6195dd1992469b 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -92,10 +92,11 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
 ; SDAG-VI-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s3, 0
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
+; SDAG-VI-NEXT:    s_sext_i32_i16 s2, s2
+; SDAG-VI-NEXT:    s_sext_i32_i16 s3, s3
+; SDAG-VI-NEXT:    v_med3_i32 v1, s2, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s3, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -417,12 +418,12 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
 ; SDAG-VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, 0xff
 ; SDAG-VI-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT:    s_lshr_b32 s2, s4, 16
-; SDAG-VI-NEXT:    v_max_i16_e64 v1, s4, 0
-; SDAG-VI-NEXT:    v_max_i16_e64 v2, s2, 0
-; SDAG-VI-NEXT:    v_min_i16_e32 v1, 0xff, v1
-; SDAG-VI-NEXT:    v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; SDAG-VI-NEXT:    v_or_b32_e32 v2, v1, v0
+; SDAG-VI-NEXT:    s_ashr_i32 s2, s4, 16
+; SDAG-VI-NEXT:    s_sext_i32_i16 s3, s4
+; SDAG-VI-NEXT:    v_med3_i32 v1, s3, 0, v0
+; SDAG-VI-NEXT:    v_med3_i32 v0, s2, 0, v0
+; SDAG-VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v0, s0
 ; SDAG-VI-NEXT:    v_mov_b32_e32 v1, s1
 ; SDAG-VI-NEXT:    flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 85c34e036e1fd9..aec86ec343bdb5 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -85,8 +85,7 @@ entry:
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
 
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
+; GCN-PROMOTE: s_cmp_eq_u32
 
 ; GCN: s_cbranch
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 50927a2cf21afe..15a83475f368e9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -46,9 +46,11 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-NEXT:    v_readlane_b32 s5, v1, 1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0xffff
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u16_e64 s[4:5], v0, s4
+; CHECK-NEXT:    v_and_b32_e64 v0, s4, v0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[4:5], v0, s4
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_4
 ; CHECK-NEXT:  ; %bb.3: ; %bb201
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index 978ac548443f73..0be1a764272dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -28,6 +28,7 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_addk_i32 s0, 0x3e7
 ; VI-NEXT:    s_or_b32 s0, s0, 4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -41,9 +42,11 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -318,42 +321,42 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
 ; VI-LABEL: widen_v2i8_constant_load:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 44
-; VI-NEXT:    v_mov_b32_e32 v1, 3
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s1, s0, 0xffff
+; VI-NEXT:    s_and_b32 s1, s0, 0xff00
+; VI-NEXT:    s_add_i32 s0, s0, 12
+; VI-NEXT:    s_or_b32 s0, s0, 4
+; VI-NEXT:    s_and_b32 s0, s0, 0xff
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_addk_i32 s0, 0x2c00
+; VI-NEXT:    s_or_b32 s0, s0, 0x300
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_i32 s1, s1, 12
-; VI-NEXT:    v_add_u32_sdwa v0, vcc, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; VI-NEXT:    s_or_b32 s0, s1, 4
-; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_mov_b32_e32 v3, s0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
-; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: widen_v2i8_constant_load:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v0, s0, 12
-; GFX11-NEXT:    v_and_b32_e64 v1, 0xffffff00, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, 4, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_add_nc_u16 v2, v0, 0x2c00
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x300, v2
+; GFX11-NEXT:    s_add_i32 s1, s0, 12
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff00
+; GFX11-NEXT:    s_or_b32 s1, s1, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX11-NEXT:    s_addk_i32 s0, 0x2c00
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, 0x300
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -621,6 +624,7 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_addk_i32 s0, 0x3e7
 ; VI-NEXT:    s_or_b32 s0, s0, 4
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -635,9 +639,11 @@ define amdgpu_kernel void @widen_i16_constant32_load(ptr addrspace(6) %arg) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -675,6 +681,7 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
 ; VI-NEXT:    s_addk_i32 s0, 0x3e7
 ; VI-NEXT:    s_or_b32 s0, s0, 1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
@@ -688,9 +695,11 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_addk_i32 s0, 0x3e7
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index af50e09f509a3b..f2276c000f17cd 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=SI,GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=VI,GCN %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 
 ; R600: {{^}}s_mad_zext_i32_to_i64:
@@ -51,9 +51,10 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
 ; GCN: s_load_dword [[A:s[0-9]+]]
 ; GCN: s_load_dword [[B:s[0-9]+]]
 
-; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
-; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
+; GCN-DAG: s_and_b32 [[A]], [[A]], 0xffff{{$}}
+; GCN-DAG: s_and_b32 [[B]], [[B]], 0xffff{{$}}
+; SI: s_cmp_eq_u32 [[A]], [[B]]
+; VI: s_cmp_eq_u32 [[B]], [[A]]
 ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]



More information about the llvm-commits mailing list